diff --git a/.rat-excludes b/.rat-excludes
index 236c2db05367..9165872b9fb2 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -93,3 +93,5 @@ INDEX
 .lintr
 gen-java.*
 .*avpr
+org.apache.spark.sql.sources.DataSourceRegister
+.*parquet
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 000000000000..95f80d808488
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,24649 @@
+Spark Change Log
+----------------
+
+Release 1.5.0
+
+  [SPARK-10143] [SQL] Use parquet's block size (row group size) setting as the min split size if necessary.
+  Yin Huai
+  2015-08-21 14:30:00 -0700
+  Commit: 14c8c0c, github.com/apache/spark/pull/8346
+
+  [SPARK-9864] [DOC] [MLlib] [SQL] Replace since in scaladoc to Since annotation
+  MechCoder
+  2015-08-21 14:19:24 -0700
+  Commit: e7db876, github.com/apache/spark/pull/8352
+
+  [SPARK-10122] [PYSPARK] [STREAMING] Fix getOffsetRanges bug in PySpark-Streaming transform function
+  jerryshao
+  2015-08-21 13:10:11 -0700
+  Commit: 4e72839, github.com/apache/spark/pull/8347
+
+  [SPARK-10130] [SQL] type coercion for IF should have children resolved first
+  Daoyuan Wang
+  2015-08-21 12:21:51 -0700
+  Commit: 817c38a, github.com/apache/spark/pull/8331
+
+  [SPARK-9846] [DOCS] User guide for Multilayer Perceptron Classifier
+  Alexander Ulanov
+  2015-08-20 20:02:27 -0700
+  Commit: e5e6017, github.com/apache/spark/pull/8262
+
+  [SPARK-10140] [DOC] add target fields to @Since
+  Xiangrui Meng
+  2015-08-20 20:01:13 -0700
+  Commit: 04ef52a, github.com/apache/spark/pull/8344
+
+  Preparing development version 1.5.1-SNAPSHOT
+  Patrick Wendell
+  2015-08-20 16:24:12 -0700
+  Commit: 988e838
+
+  Preparing Spark release v1.5.0-rc1
+  Patrick Wendell
+  2015-08-20 16:24:07 -0700
+  Commit: 4c56ad7
+
+  Preparing development version 1.5.0-SNAPSHOT
+  Patrick Wendell
+  2015-08-20 15:33:10 -0700
+  Commit: 175c1d9
+
+  Preparing Spark release v1.5.0-rc1
+  Patrick Wendell
+  2015-08-20 15:33:04 -0700
+  Commit: d837d51
+
+  [SPARK-9245] [MLLIB] LDA topic assignments
+  Joseph K. Bradley
+  2015-08-20 15:01:31 -0700
+  Commit: 2beea65, github.com/apache/spark/pull/8329
+
+  [SPARK-10108] Add since tags to mllib.feature
+  MechCoder
+  2015-08-20 14:56:08 -0700
+  Commit: 560ec12, github.com/apache/spark/pull/8309
+
+  [SPARK-10138] [ML] move setters to MultilayerPerceptronClassifier and add Java test suite
+  Xiangrui Meng
+  2015-08-20 14:47:04 -0700
+  Commit: 2e0d2a9, github.com/apache/spark/pull/8342
+
+  Preparing development version 1.5.0-SNAPSHOT
+  Patrick Wendell
+  2015-08-20 12:43:13 -0700
+  Commit: eac31ab
+
+  Preparing Spark release v1.5.0-rc1
+  Patrick Wendell
+  2015-08-20 12:43:08 -0700
+  Commit: 99eeac8
+
+  [SPARK-10126] [PROJECT INFRA] Fix typo in release-build.sh which broke snapshot publishing for Scala 2.11
+  Josh Rosen
+  2015-08-20 11:31:03 -0700
+  Commit: 6026f4f, github.com/apache/spark/pull/8325
+
+  Preparing development version 1.5.0-SNAPSHOT
+  Patrick Wendell
+  2015-08-20 11:06:41 -0700
+  Commit: a1785e3
+
+  Preparing Spark release v1.5.0-rc1
+  Patrick Wendell
+  2015-08-20 11:06:31 -0700
+  Commit: 19b92c8
+
+  [SPARK-10136] [SQL] Fixes Parquet support for Avro array of primitive array
+  Cheng Lian
+  2015-08-20 11:00:24 -0700
+  Commit: 2f47e09, github.com/apache/spark/pull/8341
+
+  [SPARK-9982] [SPARKR] SparkR DataFrame fail to return data of Decimal type
+  Alex Shkurenko
+  2015-08-20 10:16:38 -0700
+  Commit: a7027e6, github.com/apache/spark/pull/8239
+
+  [MINOR] [SQL] Fix sphinx warnings in PySpark SQL
+  MechCoder
+  2015-08-20 10:05:31 -0700
+  Commit: 257e9d7, github.com/apache/spark/pull/8171
+
+  [SPARK-10100] [SQL] Eliminate hash table lookup if there is no grouping key in aggregation.
+  Reynold Xin
+  2015-08-20 07:53:27 -0700
+  Commit: 5be5175, github.com/apache/spark/pull/8332
+
+  [SPARK-10092] [SQL] Backports #8324 to branch-1.5
+  Yin Huai
+  2015-08-20 18:43:24 +0800
+  Commit: 675e224, github.com/apache/spark/pull/8336
+
+  [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data
+  Tathagata Das
+  2015-08-19 21:15:58 -0700
+  Commit: 71aa547, github.com/apache/spark/pull/8328
+
+  [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop
+  zsxwing
+  2015-08-19 19:43:09 -0700
+  Commit: 63922fa, github.com/apache/spark/pull/8326
+
+  [SPARK-10124] [MESOS] Fix removing queued driver in mesos cluster mode.
+  Timothy Chen
+  2015-08-19 19:43:26 -0700
+  Commit: a3ed2c3, github.com/apache/spark/pull/8322
+
+  [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs
+  zsxwing
+  2015-08-19 18:36:01 -0700
+  Commit: 16414da, github.com/apache/spark/pull/8315
+
+  [SPARK-9242] [SQL] Audit UDAF interface.
+  Reynold Xin
+  2015-08-19 17:35:41 -0700
+  Commit: 321cb99, github.com/apache/spark/pull/8321
+
+  [SPARK-9895] User Guide for RFormula Feature Transformer
+  Eric Liang
+  2015-08-19 15:43:08 -0700
+  Commit: 56a37b0, github.com/apache/spark/pull/8293
+
+  [SPARK-6489] [SQL] add column pruning for Generate
+  Wenchen Fan
+  2015-08-19 15:04:56 -0700
+  Commit: 5c749c8, github.com/apache/spark/pull/8268
+
+  [SPARK-10119] [CORE] Fix isDynamicAllocationEnabled when config is expliticly disabled.
+ Marcelo Vanzin + 2015-08-19 14:33:32 -0700 + Commit: a59475f, github.com/apache/spark/pull/8316 + + [SPARK-10083] [SQL] CaseWhen should support type coercion of DecimalType and FractionalType + Daoyuan Wang + 2015-08-19 14:31:51 -0700 + Commit: 1494d58, github.com/apache/spark/pull/8270 + + [SPARK-9899] [SQL] Disables customized output committer when speculation is on + Cheng Lian + 2015-08-19 14:15:28 -0700 + Commit: b32a31d, github.com/apache/spark/pull/8317 + + [SPARK-10090] [SQL] fix decimal scale of division + Davies Liu + 2015-08-19 14:03:47 -0700 + Commit: d9dfd43, github.com/apache/spark/pull/8287 + + [SPARK-9627] [SQL] Stops using Scala runtime reflection in DictionaryEncoding + Cheng Lian + 2015-08-19 13:57:52 -0700 + Commit: 77269fc, github.com/apache/spark/pull/8306 + + [SPARK-10073] [SQL] Python withColumn should replace the old column + Davies Liu + 2015-08-19 13:56:40 -0700 + Commit: afaed7e, github.com/apache/spark/pull/8300 + + [SPARK-10087] [CORE] [BRANCH-1.5] Disable spark.shuffle.reduceLocality.enabled by default. + Yin Huai + 2015-08-19 13:43:46 -0700 + Commit: 829c33a, github.com/apache/spark/pull/8296 + + [SPARK-10107] [SQL] fix NPE in format_number + Davies Liu + 2015-08-19 13:43:04 -0700 + Commit: 1038f67, github.com/apache/spark/pull/8305 + + [SPARK-8918] [MLLIB] [DOC] Add @since tags to mllib.clustering + Xiangrui Meng , Xiaoqing Wang , MechCoder + 2015-08-19 13:17:26 -0700 + Commit: 8c0a5a2, github.com/apache/spark/pull/8256 + + [SPARK-10106] [SPARKR] Add `ifelse` Column function to SparkR + Yu ISHIKAWA + 2015-08-19 12:39:37 -0700 + Commit: ba36925, github.com/apache/spark/pull/8303 + + [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator` + Feynman Liang , Joseph K. Bradley + 2015-08-19 11:35:05 -0700 + Commit: f25c324, github.com/apache/spark/pull/8290 + + [SPARK-9856] [SPARKR] Add expression functions into SparkR whose params are complicated + Yu ISHIKAWA + 2015-08-19 10:41:14 -0700 + Commit: a8e8808, github.com/apache/spark/pull/8264 + + [SPARK-10084] [MLLIB] [DOC] Add Python example for mllib FP-growth user guide + Yanbo Liang + 2015-08-19 08:53:34 -0700 + Commit: bebe63d, github.com/apache/spark/pull/8279 + + [SPARK-10060] [ML] [DOC] spark.ml DecisionTree user guide + Joseph K. 
Bradley + 2015-08-19 07:38:27 -0700 + Commit: f8dc427, github.com/apache/spark/pull/8244 + + [SPARK-8949] Print warnings when using preferred locations feature + Han JU + 2015-08-19 13:04:16 +0100 + Commit: 522b0b6, github.com/apache/spark/pull/7874 + + [SPARK-9977] [DOCS] Update documentation for StringIndexer + lewuathe + 2015-08-19 09:54:03 +0100 + Commit: 5553f02, github.com/apache/spark/pull/8205 + + [DOCS] [SQL] [PYSPARK] Fix typo in ntile function + Moussa Taifi + 2015-08-19 09:42:41 +0100 + Commit: e56bcc6, github.com/apache/spark/pull/8261 + + [SPARK-10070] [DOCS] Remove Guava dependencies in user guides + Sean Owen + 2015-08-19 09:41:09 +0100 + Commit: 561390d, github.com/apache/spark/pull/8272 + + Fix Broken Link + Bill Chambers + 2015-08-19 00:05:01 -0700 + Commit: 417852f, github.com/apache/spark/pull/8302 + + [SPARK-9967] [SPARK-10099] [STREAMING] Renamed conf spark.streaming.backpressure.{enable-->enabled} and fixed deprecated annotations + Tathagata Das + 2015-08-18 23:37:57 -0700 + Commit: 392bd19, github.com/apache/spark/pull/8299 + + [SPARK-9952] Fix N^2 loop when DAGScheduler.getPreferredLocsInternal accesses cacheLocs + Josh Rosen + 2015-08-18 22:30:13 -0700 + Commit: 3ceee55, github.com/apache/spark/pull/8178 + + [SPARK-9508] GraphX Pregel docs update with new Pregel code + Alexander Ulanov + 2015-08-18 22:13:52 -0700 + Commit: 4163926, github.com/apache/spark/pull/7831 + + [SPARK-9705] [DOC] fix docs about Python version + Davies Liu + 2015-08-18 22:11:27 -0700 + Commit: 03a8a88, github.com/apache/spark/pull/8245 + + [SPARK-10093] [SPARK-10096] [SQL] Avoid transformation on executors & fix UDFs on complex types + Reynold Xin , Michael Armbrust + 2015-08-18 22:08:15 -0700 + Commit: 3c33931, github.com/apache/spark/pull/8295 + + [SPARK-10095] [SQL] use public API of BigInteger + Davies Liu + 2015-08-18 20:39:59 -0700 + Commit: 11c9335, github.com/apache/spark/pull/8286 + + [SPARK-10075] [SPARKR] Add `when` expressino function in SparkR + Yu ISHIKAWA + 2015-08-18 20:27:36 -0700 + Commit: ebaeb18, github.com/apache/spark/pull/8266 + + [SPARK-9939] [SQL] Resorts to Java process API in CliSuite, HiveSparkSubmitSuite and HiveThriftServer2 test suites + Cheng Lian + 2015-08-19 11:21:46 +0800 + Commit: bb2fb59, github.com/apache/spark/pull/8168 + + [SPARK-10102] [STREAMING] Fix a race condition that startReceiver may happen before setting trackerState to Started + zsxwing + 2015-08-18 20:15:54 -0700 + Commit: a6f8979, github.com/apache/spark/pull/8294 + + [SPARK-10072] [STREAMING] BlockGenerator can deadlock when the queue of generate blocks fills up to capacity + Tathagata Das + 2015-08-18 19:26:38 -0700 + Commit: 08c5962, github.com/apache/spark/pull/8257 + + [SPARKR] [MINOR] Get rid of a long line warning + Yu ISHIKAWA + 2015-08-18 19:18:05 -0700 + Commit: 0a1385e, github.com/apache/spark/pull/8297 + + Bump SparkR version string to 1.5.0 + Hossein + 2015-08-18 18:02:22 -0700 + Commit: 9b42e24, github.com/apache/spark/pull/8291 + + [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT + Feynman Liang + 2015-08-18 17:54:49 -0700 + Commit: 4ee225a, github.com/apache/spark/pull/8184 + + [SPARK-10098] [STREAMING] [TEST] Cleanup active context after test in FailureSuite + Tathagata Das + 2015-08-18 
17:00:13 -0700 + Commit: e1b50c7, github.com/apache/spark/pull/8289 + + [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt + lewuathe + 2015-08-18 15:30:23 -0700 + Commit: fb207b2, github.com/apache/spark/pull/8223 + + [SPARK-8924] [MLLIB, DOCUMENTATION] Added @since tags to mllib.tree + Bryan Cutler + 2015-08-18 14:58:30 -0700 + Commit: 56f4da2, github.com/apache/spark/pull/7380 + + [SPARK-10088] [SQL] Add support for "stored as avro" in HiveQL parser. + Marcelo Vanzin + 2015-08-18 14:45:19 -0700 + Commit: 8b0df5a, github.com/apache/spark/pull/8282 + + [SPARK-10089] [SQL] Add missing golden files. + Marcelo Vanzin + 2015-08-18 14:43:05 -0700 + Commit: 74a6b1a, github.com/apache/spark/pull/8283 + + [SPARK-10080] [SQL] Fix binary incompatibility for $ column interpolation + Michael Armbrust + 2015-08-18 13:50:51 -0700 + Commit: 80a6fb5, github.com/apache/spark/pull/8281 + + [SPARK-9574] [STREAMING] Remove unnecessary contents of spark-streaming-XXX-assembly jars + zsxwing + 2015-08-18 13:35:45 -0700 + Commit: 2bccd91, github.com/apache/spark/pull/8069 + + [SPARK-10085] [MLLIB] [DOCS] removed unnecessary numpy array import + Piotr Migdal + 2015-08-18 12:59:28 -0700 + Commit: 9bd2e6f, github.com/apache/spark/pull/8284 + + [SPARK-10032] [PYSPARK] [DOC] Add Python example for mllib LDAModel user guide + Yanbo Liang + 2015-08-18 12:56:36 -0700 + Commit: ec7079f, github.com/apache/spark/pull/8227 + + [SPARK-10029] [MLLIB] [DOC] Add Python examples for mllib IsotonicRegression user guide + Yanbo Liang + 2015-08-18 12:55:36 -0700 + Commit: 80debff, github.com/apache/spark/pull/8225 + + [SPARK-9900] [MLLIB] User guide for Association Rules + Feynman Liang + 2015-08-18 12:53:57 -0700 + Commit: 7ff0e5d, github.com/apache/spark/pull/8207 + + [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel + Yuhao Yang , Joseph K. Bradley + 2015-08-18 11:00:09 -0700 + Commit: b86378c, github.com/apache/spark/pull/7388 + + [SPARK-10007] [SPARKR] Update `NAMESPACE` file in SparkR for simple parameters functions + Yuu ISHIKAWA + 2015-08-18 09:10:59 -0700 + Commit: 20a760a, github.com/apache/spark/pull/8277 + + [SPARK-8118] [SQL] Redirects Parquet JUL logger via SLF4J + Cheng Lian + 2015-08-18 20:15:33 +0800 + Commit: a512250, github.com/apache/spark/pull/8196 + + [MINOR] fix the comments in IndexShuffleBlockResolver + CodingCat + 2015-08-18 10:31:11 +0100 + Commit: 42a0b48, github.com/apache/spark/pull/8238 + + [SPARK-10076] [ML] make MultilayerPerceptronClassifier layers and weights public + Yanbo Liang + 2015-08-17 23:57:02 -0700 + Commit: 40b89c3, github.com/apache/spark/pull/8263 + + [SPARK-10038] [SQL] fix bug in generated unsafe projection when there is binary in ArrayData + Davies Liu + 2015-08-17 23:27:55 -0700 + Commit: e5fbe4f, github.com/apache/spark/pull/8250 + + [MINOR] Format the comment of `translate` at `functions.scala` + Yu ISHIKAWA + 2015-08-17 23:27:11 -0700 + Commit: 2803e8b, github.com/apache/spark/pull/8265 + + [SPARK-7808] [ML] add package doc for ml.feature + Xiangrui Meng + 2015-08-17 19:40:51 -0700 + Commit: 3554250, github.com/apache/spark/pull/8260 + + [SPARK-10059] [YARN] Explicitly add JSP dependencies for tests. 
+ Marcelo Vanzin + 2015-08-17 19:35:35 -0700 + Commit: bfb4c84, github.com/apache/spark/pull/8251 + + [SPARK-9902] [MLLIB] Add Java and Python examples to user guide for 1-sample KS test + jose.cambronero + 2015-08-17 19:09:45 -0700 + Commit: 9740d43, github.com/apache/spark/pull/8154 + + [SPARK-7707] User guide and example code for KernelDensity + Sandy Ryza + 2015-08-17 17:57:51 -0700 + Commit: 5de0ffb, github.com/apache/spark/pull/8230 + + [SPARK-9898] [MLLIB] Prefix Span user guide + Feynman Liang + 2015-08-17 17:53:24 -0700 + Commit: 18b3d11, github.com/apache/spark/pull/8253 + + SPARK-8916 [Documentation, MLlib] Add @since tags to mllib.regression + Prayag Chandran + 2015-08-17 17:26:08 -0700 + Commit: f5ed9ed, github.com/apache/spark/pull/7518 + + [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct + Yanbo Liang + 2015-08-17 17:25:41 -0700 + Commit: eaeebb9, github.com/apache/spark/pull/8061 + + [SPARK-9974] [BUILD] [SQL] Makes sure com.twitter:parquet-hadoop-bundle:1.6.0 is in SBT assembly jar + Cheng Lian + 2015-08-17 17:25:14 -0700 + Commit: 407175e, github.com/apache/spark/pull/8198 + + [SPARK-8920] [MLLIB] Add @since tags to mllib.linalg + Sameer Abhyankar , Sameer Abhyankar + 2015-08-17 16:00:23 -0700 + Commit: 0f1417b, github.com/apache/spark/pull/7729 + + [SPARK-10068] [MLLIB] Adds links to MLlib types, algos, utilities listing + Feynman Liang + 2015-08-17 15:42:14 -0700 + Commit: bb3bb2a, github.com/apache/spark/pull/8255 + + [SPARK-9592] [SQL] Fix Last function implemented based on AggregateExpression1. + Yin Huai + 2015-08-17 15:30:50 -0700 + Commit: f77eaaf, github.com/apache/spark/pull/8172 + + [SPARK-9526] [SQL] Utilize randomized tests to reveal potential bugs in sql expressions + Yijie Shen + 2015-08-17 14:10:19 -0700 + Commit: 24765cc, github.com/apache/spark/pull/7855 + + [SPARK-10036] [SQL] Load JDBC driver in DataFrameReader.jdbc and DataFrameWriter.jdbc + zsxwing + 2015-08-17 11:53:33 -0700 + Commit: 4daf79f, github.com/apache/spark/pull/8232 + + [SPARK-9950] [SQL] Wrong Analysis Error for grouping/aggregating on struct fields + Wenchen Fan + 2015-08-17 11:36:18 -0700 + Commit: 76390ec, github.com/apache/spark/pull/8222 + + [SPARK-7837] [SQL] Avoids double closing output writers when commitTask() fails + Cheng Lian + 2015-08-18 00:59:05 +0800 + Commit: 7279445, github.com/apache/spark/pull/8236 + + [SPARK-9959] [MLLIB] Association Rules Java Compatibility + Feynman Liang + 2015-08-17 09:58:34 -0700 + Commit: d554bf4, github.com/apache/spark/pull/8206 + + [SPARK-9871] [SPARKR] Add expression functions into SparkR which have a variable parameter + Yu ISHIKAWA + 2015-08-16 23:33:20 -0700 + Commit: 78275c4, github.com/apache/spark/pull/8194 + + [SPARK-10005] [SQL] Fixes schema merging for nested structs + Cheng Lian + 2015-08-16 10:17:58 -0700 + Commit: 90245f6, github.com/apache/spark/pull/8228 + + [SPARK-9973] [SQL] Correct in-memory columnar buffer size + Kun Xu + 2015-08-16 14:44:23 +0800 + Commit: e2c6ef8, github.com/apache/spark/pull/8189 + + [SPARK-10008] Ensure shuffle locality doesn't take precedence over narrow deps + Matei Zaharia + 2015-08-16 00:34:58 -0700 + Commit: fa55c27, github.com/apache/spark/pull/8220 + + [SPARK-8844] [SPARKR] head/collect is 
broken in SparkR. + Sun Rui + 2015-08-16 00:30:02 -0700 + Commit: 4f75ce2, github.com/apache/spark/pull/7419 + + [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests + Joseph K. Bradley + 2015-08-15 18:48:20 -0700 + Commit: 881baf1, github.com/apache/spark/pull/8087 + + [SPARK-9955] [SQL] correct error message for aggregate + Wenchen Fan + 2015-08-15 14:13:12 -0700 + Commit: 2fda1d8, github.com/apache/spark/pull/8203 + + [SPARK-9980] [BUILD] Fix SBT publishLocal error due to invalid characters in doc + Herman van Hovell + 2015-08-15 10:46:04 +0100 + Commit: 1a6f0af, github.com/apache/spark/pull/8209 + + [SPARK-9725] [SQL] fix serialization of UTF8String across different JVM + Davies Liu + 2015-08-14 22:30:35 -0700 + Commit: d97af68, github.com/apache/spark/pull/8210 + + [SPARK-9960] [GRAPHX] sendMessage type fix in LabelPropagation.scala + zc he + 2015-08-14 21:28:50 -0700 + Commit: 3301500, github.com/apache/spark/pull/8188 + + [SPARK-9634] [SPARK-9323] [SQL] cleanup unnecessary Aliases in LogicalPlan at the end of analysis + Wenchen Fan , Michael Armbrust + 2015-08-14 20:59:54 -0700 + Commit: 83cbf60, github.com/apache/spark/pull/8215 + + [HOTFIX] fix duplicated braces + Davies Liu + 2015-08-14 20:56:55 -0700 + Commit: 3cdeeaf, github.com/apache/spark/pull/8219 + + [SPARK-9934] Deprecate NIO ConnectionManager. + Reynold Xin + 2015-08-14 20:55:32 -0700 + Commit: d842917, github.com/apache/spark/pull/8162 + + [SPARK-9949] [SQL] Fix TakeOrderedAndProject's output. + Yin Huai + 2015-08-14 17:35:17 -0700 + Commit: 6be945c, github.com/apache/spark/pull/8179 + + [SPARK-9968] [STREAMING] Reduced time spent within synchronized block to prevent lock starvation + Tathagata Das + 2015-08-14 15:54:14 -0700 + Commit: 8d26247, github.com/apache/spark/pull/8204 + + [SPARK-9966] [STREAMING] Handle couple of corner cases in PIDRateEstimator + Tathagata Das + 2015-08-14 15:10:01 -0700 + Commit: 612b460, github.com/apache/spark/pull/8199 + + [SPARK-8670] [SQL] Nested columns can't be referenced in pyspark + Wenchen Fan + 2015-08-14 14:09:46 -0700 + Commit: 5bbb2d3, github.com/apache/spark/pull/8202 + + [SPARK-9981] [ML] Made labels public for StringIndexerModel + Joseph K. 
Bradley + 2015-08-14 14:05:03 -0700 + Commit: 0f4ccdc, github.com/apache/spark/pull/8211 + + [SPARK-9978] [PYSPARK] [SQL] fix Window.orderBy and doc of ntile() + Davies Liu + 2015-08-14 13:55:29 -0700 + Commit: 59cdcc0, github.com/apache/spark/pull/8213 + + [SPARK-9877] [CORE] Fix StandaloneRestServer NPE when submitting application + jerryshao + 2015-08-14 13:44:38 -0700 + Commit: 130e06e, github.com/apache/spark/pull/8127 + + [SPARK-9948] Fix flaky AccumulatorSuite - internal accumulators + Andrew Or + 2015-08-14 13:42:53 -0700 + Commit: 1ce0b01, github.com/apache/spark/pull/8176 + + [SPARK-9809] Task crashes because the internal accumulators are not properly initialized + Carson Wang + 2015-08-14 13:38:25 -0700 + Commit: ff3e956, github.com/apache/spark/pull/8090 + + [SPARK-9828] [PYSPARK] Mutable values should not be default arguments + MechCoder + 2015-08-14 12:46:05 -0700 + Commit: d92568a, github.com/apache/spark/pull/8110 + + [SPARK-9561] Re-enable BroadcastJoinSuite + Andrew Or + 2015-08-14 12:37:21 -0700 + Commit: b284213, github.com/apache/spark/pull/8208 + + [SPARK-9946] [SPARK-9589] [SQL] fix NPE and thread-safety in TaskMemoryManager + Davies Liu + 2015-08-14 12:32:35 -0700 + Commit: e2a288c, github.com/apache/spark/pull/8177 + + [SPARK-8744] [ML] Add a public constructor to StringIndexer + Holden Karau + 2015-08-14 11:22:10 -0700 + Commit: e4ea239, github.com/apache/spark/pull/7267 + + [SPARK-9956] [ML] Make trees work with one-category features + Joseph K. Bradley + 2015-08-14 10:48:02 -0700 + Commit: f5298da, github.com/apache/spark/pull/8187 + + [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661 + Xiangrui Meng + 2015-08-14 10:25:11 -0700 + Commit: 4aa9238, github.com/apache/spark/pull/8190 + + [SPARK-9958] [SQL] Make HiveThriftServer2Listener thread-safe and update the tab name to "JDBC/ODBC Server" + zsxwing + 2015-08-14 14:41:53 +0800 + Commit: a0d52eb, github.com/apache/spark/pull/8185 + + [MINOR] [SQL] Remove canEqual in Row + Liang-Chi Hsieh + 2015-08-13 22:06:09 -0700 + Commit: 00ccb21, github.com/apache/spark/pull/8170 + + [SPARK-9945] [SQL] pageSize should be calculated from executor.memory + Davies Liu + 2015-08-13 21:12:59 -0700 + Commit: 703e3f1, github.com/apache/spark/pull/8175 + + [SPARK-9580] [SQL] Replace singletons in SQL tests + Andrew Or + 2015-08-13 17:42:01 -0700 + Commit: 9df2a2d, github.com/apache/spark/pull/8111 + + [SPARK-9943] [SQL] deserialized UnsafeHashedRelation should be serializable + Davies Liu + 2015-08-13 17:35:11 -0700 + Commit: b318b11, github.com/apache/spark/pull/8174 + + [SPARK-8976] [PYSPARK] fix open mode in python3 + Davies Liu + 2015-08-13 17:33:37 -0700 + Commit: cadc3b7, github.com/apache/spark/pull/8181 + + [SPARK-9922] [ML] rename StringIndexerReverse to IndexToString + Xiangrui Meng + 2015-08-13 16:52:17 -0700 + Commit: 2b6b1d1, github.com/apache/spark/pull/8152 + + [SPARK-9942] [PYSPARK] [SQL] ignore exceptions while try to import pandas + Davies Liu + 2015-08-13 14:03:55 -0700 + Commit: 2c7f8da, github.com/apache/spark/pull/8173 + + [SPARK-9661] [MLLIB] [ML] Java compatibility + MechCoder + 2015-08-13 13:42:35 -0700 + Commit: 875ecc7, github.com/apache/spark/pull/8126 + + [SPARK-9649] Fix MasterSuite, third time's a charm + Andrew Or + 2015-08-13 
11:31:10 -0700 + Commit: 3046020 + + [MINOR] [DOC] fix mllib pydoc warnings + Xiangrui Meng + 2015-08-13 10:16:40 -0700 + Commit: 883c7d3, github.com/apache/spark/pull/8169 + + [MINOR] [ML] change MultilayerPerceptronClassifierModel to MultilayerPerceptronClassificationModel + Yanbo Liang + 2015-08-13 09:31:14 -0700 + Commit: 2b13532, github.com/apache/spark/pull/8164 + + [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param + Rosstin + 2015-08-13 09:18:39 -0700 + Commit: 49085b5, github.com/apache/spark/pull/8081 + + [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent + lewuathe , Lewuathe + 2015-08-13 09:17:19 -0700 + Commit: fe05142, github.com/apache/spark/pull/7447 + + [SPARK-9757] [SQL] Fixes persistence of Parquet relation with decimal column + Yin Huai , Cheng Lian + 2015-08-13 16:16:50 +0800 + Commit: 5592d16, github.com/apache/spark/pull/8130 + + [SPARK-9885] [SQL] Also pass barrierPrefixes and sharedPrefixes to IsolatedClientLoader when hiveMetastoreJars is set to maven. + Yin Huai + 2015-08-13 15:08:57 +0800 + Commit: 2a600da, github.com/apache/spark/pull/8158 + + [SPARK-9918] [MLLIB] remove runs from k-means and rename epsilon to tol + Xiangrui Meng + 2015-08-12 23:04:59 -0700 + Commit: ae18342, github.com/apache/spark/pull/8148 + + [SPARK-9914] [ML] define setters explicitly for Java and use setParam group in RFormula + Xiangrui Meng + 2015-08-12 22:30:33 -0700 + Commit: d213aa7, github.com/apache/spark/pull/8143 + + [SPARK-9927] [SQL] Revert 8049 since it's pushing wrong filter down + Yijie Shen + 2015-08-13 13:33:39 +0800 + Commit: 694e7a3, github.com/apache/spark/pull/8157 + + [SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation + shikai.tang + 2015-08-12 21:53:15 -0700 + Commit: 6902840, github.com/apache/spark/pull/7429 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-12 21:43:13 -0700 + Commit: 8f055e5 + + Preparing Spark release v1.5.0-preview-20150812 + Patrick Wendell + 2015-08-12 21:42:59 -0700 + Commit: cedce9b + + [SPARK-9917] [ML] add getMin/getMax and doc for originalMin/origianlMax in MinMaxScaler + Xiangrui Meng + 2015-08-12 21:33:38 -0700 + Commit: 16f4bf4, github.com/apache/spark/pull/8145 + + [SPARK-9832] [SQL] add a thread-safe lookup for BytesToBytseMap + Davies Liu + 2015-08-12 21:26:00 -0700 + Commit: 8229437, github.com/apache/spark/pull/8151 + + [SPARK-9920] [SQL] The simpleString of TungstenAggregate does not show its output + Yin Huai + 2015-08-12 21:24:15 -0700 + Commit: 3b1b8ea, github.com/apache/spark/pull/8150 + + [SPARK-9916] [BUILD] [SPARKR] removed left-over sparkr.zip copy/create commands from codebase + Burak Yavuz + 2015-08-12 20:59:38 -0700 + Commit: 3d1b9f0, github.com/apache/spark/pull/8147 + + [SPARK-9903] [MLLIB] skip local processing in PrefixSpan if there are no small prefixes + Xiangrui Meng + 2015-08-12 20:44:40 -0700 + Commit: af470a7, github.com/apache/spark/pull/8136 + + [SPARK-9704] [ML] Made ProbabilisticClassifier, Identifiable, VectorUDT public APIs + Joseph K. Bradley + 2015-08-12 20:43:36 -0700 + Commit: a06860c, github.com/apache/spark/pull/8004 + + [SPARK-9199] [CORE] Update Tachyon dependency from 0.7.0 -> 0.7.1. 
+ Calvin Jia + 2015-08-12 20:07:37 -0700 + Commit: c182dc4, github.com/apache/spark/pull/8135 + + [SPARK-9908] [SQL] When spark.sql.tungsten.enabled is false, broadcast join does not work + Yin Huai + 2015-08-12 20:03:55 -0700 + Commit: 71ea61f, github.com/apache/spark/pull/8149 + + [SPARK-9827] [SQL] fix fd leak in UnsafeRowSerializer + Davies Liu + 2015-08-12 20:02:55 -0700 + Commit: eebb3f9, github.com/apache/spark/pull/8116 + + [SPARK-9870] Disable driver UI and Master REST server in SparkSubmitSuite + Josh Rosen + 2015-08-12 18:52:11 -0700 + Commit: 4b547b9, github.com/apache/spark/pull/8124 + + [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple + Yu ISHIKAWA + 2015-08-12 18:33:27 -0700 + Commit: ca39c9e, github.com/apache/spark/pull/8123 + + [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation … + cody koeninger + 2015-08-12 17:44:16 -0700 + Commit: 62ab2a4, github.com/apache/spark/pull/8133 + + [SPARK-9449] [SQL] Include MetastoreRelation's inputFiles + Michael Armbrust + 2015-08-12 17:07:29 -0700 + Commit: 3298fb6, github.com/apache/spark/pull/8119 + + [SPARK-9915] [ML] stopWords should use StringArrayParam + Xiangrui Meng + 2015-08-12 17:06:12 -0700 + Commit: ed73f54, github.com/apache/spark/pull/8141 + + [SPARK-9912] [MLLIB] QRDecomposition should use QType and RType for type names instead of UType and VType + Xiangrui Meng + 2015-08-12 17:04:31 -0700 + Commit: 31b7fdc, github.com/apache/spark/pull/8140 + + [SPARK-9909] [ML] [TRIVIAL] move weightCol to shared params + Holden Karau + 2015-08-12 16:54:45 -0700 + Commit: 2f8793b, github.com/apache/spark/pull/8144 + + [SPARK-9913] [MLLIB] LDAUtils should be private + Xiangrui Meng + 2015-08-12 16:53:47 -0700 + Commit: 6aca0cf, github.com/apache/spark/pull/8142 + + [SPARK-9894] [SQL] Json writer should handle MapData. + Yin Huai + 2015-08-12 16:45:15 -0700 + Commit: 08f767a, github.com/apache/spark/pull/8137 + + [SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties + michellemay + 2015-08-12 16:17:58 -0700 + Commit: 74c9dce, github.com/apache/spark/pull/8109 + + [SPARK-9092] Fixed incompatibility when both num-executors and dynamic... + Niranjan Padmanabhan + 2015-08-12 16:10:21 -0700 + Commit: 8537e51, github.com/apache/spark/pull/7657 + + [SPARK-9907] [SQL] Python crc32 is mistakenly calling md5 + Reynold Xin + 2015-08-12 15:27:52 -0700 + Commit: b28295f, github.com/apache/spark/pull/8138 + + [SPARK-8967] [DOC] add Since annotation + Xiangrui Meng + 2015-08-12 14:28:23 -0700 + Commit: 6a7582e, github.com/apache/spark/pull/8131 + + [SPARK-9789] [ML] Added logreg threshold param back + Joseph K. Bradley + 2015-08-12 14:27:13 -0700 + Commit: bdf8dc1, github.com/apache/spark/pull/8079 + + [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML + Yanbo Liang + 2015-08-12 13:24:18 -0700 + Commit: 65b5b21, github.com/apache/spark/pull/8059 + + [SPARK-9726] [PYTHON] PySpark DF join no longer accepts on=None + Brennan Ashton + 2015-08-12 11:57:30 -0700 + Commit: 8629c33, github.com/apache/spark/pull/8016 + + [SPARK-9847] [ML] Modified copyValues to distinguish between default, explicit param values + Joseph K. 
Bradley + 2015-08-12 10:48:52 -0700 + Commit: b515f89, github.com/apache/spark/pull/8115 + + [SPARK-9804] [HIVE] Use correct value for isSrcLocal parameter. + Marcelo Vanzin + 2015-08-12 10:38:30 -0700 + Commit: e9641f1, github.com/apache/spark/pull/8086 + + [SPARK-9747] [SQL] Avoid starving an unsafe operator in aggregation + Andrew Or + 2015-08-12 10:08:35 -0700 + Commit: 4c6b129, github.com/apache/spark/pull/8038 + + [SPARK-7583] [MLLIB] User guide update for RegexTokenizer + Yuhao Yang + 2015-08-12 09:35:32 -0700 + Commit: 2d86fad, github.com/apache/spark/pull/7828 + + [SPARK-9795] Dynamic allocation: avoid double counting when killing same executor twice + Andrew Or + 2015-08-12 09:24:50 -0700 + Commit: bc4ac65, github.com/apache/spark/pull/8078 + + [SPARK-8625] [CORE] Propagate user exceptions in tasks back to driver + Tom White + 2015-08-12 10:06:27 -0500 + Commit: 0579f28, github.com/apache/spark/pull/7014 + + [SPARK-9407] [SQL] Relaxes Parquet ValidTypeMap to allow ENUM predicates to be pushed down + Cheng Lian + 2015-08-12 20:01:34 +0800 + Commit: 5e6fdc6, github.com/apache/spark/pull/8107 + + [SPARK-9182] [SQL] Filters are not passed through to jdbc source + Yijie Shen + 2015-08-12 19:54:00 +0800 + Commit: 8e32db9, github.com/apache/spark/pull/8049 + + [SPARK-9575] [MESOS] Add docuemntation around Mesos shuffle service. + Timothy Chen + 2015-08-11 23:33:22 -0700 + Commit: 5dd0c5c, github.com/apache/spark/pull/7907 + + [SPARK-8798] [MESOS] Allow additional uris to be fetched with mesos + Timothy Chen + 2015-08-11 23:26:33 -0700 + Commit: a2f8057, github.com/apache/spark/pull/7195 + + [SPARK-9426] [WEBUI] Job page DAG visualization is not shown + Carson Wang + 2015-08-11 23:25:02 -0700 + Commit: 93fc959, github.com/apache/spark/pull/8104 + + [SPARK-9829] [WEBUI] Display the update value for peak execution memory + zsxwing + 2015-08-11 23:23:17 -0700 + Commit: d9d4bde, github.com/apache/spark/pull/8121 + + [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications + Rohit Agarwal + 2015-08-11 23:20:39 -0700 + Commit: 402c0ca, github.com/apache/spark/pull/8088 + + [SPARK-8366] maxNumExecutorsNeeded should properly handle failed tasks + xutingjun , meiyoula <1039320815@qq.com> + 2015-08-11 23:19:35 -0700 + Commit: 2f90918, github.com/apache/spark/pull/6817 + + [SPARK-9854] [SQL] RuleExecutor.timeMap should be thread-safe + Josh Rosen + 2015-08-11 22:46:59 -0700 + Commit: b994f89, github.com/apache/spark/pull/8120 + + [SPARK-9831] [SQL] fix serialization with empty broadcast + Davies Liu + 2015-08-11 22:45:18 -0700 + Commit: 7024f3e, github.com/apache/spark/pull/8117 + + [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5 + Eric Liang + 2015-08-11 21:26:03 -0700 + Commit: 890c75b, github.com/apache/spark/pull/8085 + + [SPARK-1517] Refactor release scripts to facilitate nightly publishing + Patrick Wendell + 2015-08-11 21:16:48 -0700 + Commit: 6ea33f5, github.com/apache/spark/pull/7411 + + [SPARK-9649] Fix flaky test MasterSuite again - disable REST + Andrew Or + 2015-08-11 20:46:58 -0700 + Commit: 0119edf, github.com/apache/spark/pull/8084 + + [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible + Reynold Xin + 2015-08-11 18:08:49 -0700 + 
Commit: ec7a4b9, github.com/apache/spark/pull/8114 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-11 18:07:34 -0700 + Commit: b7497e3 + + Preparing Spark release v1.5.0-snapshot-20150811 + Patrick Wendell + 2015-08-11 18:07:22 -0700 + Commit: 158b2ea + + [SPARK-9074] [LAUNCHER] Allow arbitrary Spark args to be set. + Marcelo Vanzin + 2015-08-11 16:33:08 -0700 + Commit: 18d78a8, github.com/apache/spark/pull/7975 + + [HOTFIX] Fix style error caused by ef961ed48a4f45447f0e0ad256b040c7ab2d78d9 + Andrew Or + 2015-08-11 14:52:52 -0700 + Commit: 1067c73 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-11 14:32:43 -0700 + Commit: 725e5c7 + + Preparing Spark release v1.5.0-snapshot-20150811 + Patrick Wendell + 2015-08-11 14:32:37 -0700 + Commit: e9329ef + + [SPARK-8925] [MLLIB] Add @since tags to mllib.util + Sudhakar Thota , Sudhakar Thota + 2015-08-11 14:31:51 -0700 + Commit: ef961ed, github.com/apache/spark/pull/7436 + + [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility + Feynman Liang + 2015-08-11 14:21:53 -0700 + Commit: 2273e74, github.com/apache/spark/pull/8077 + + [SPARK-9824] [CORE] Fix the issue that InternalAccumulator leaks WeakReference + zsxwing + 2015-08-11 14:06:23 -0700 + Commit: cdf781d, github.com/apache/spark/pull/8108 + + [SPARK-9814] [SQL] EqualNotNull not passing to data sources + hyukjinkwon , 권혁진 + 2015-08-11 14:04:09 -0700 + Commit: eead87e, github.com/apache/spark/pull/8096 + + [SPARK-7726] Add import so Scaladoc doesn't fail. + Patrick Wendell + 2015-08-11 14:02:23 -0700 + Commit: e9d1eab, github.com/apache/spark/pull/8095 + + [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix + Feynman Liang + 2015-08-11 12:49:47 -0700 + Commit: 811d23f, github.com/apache/spark/pull/8042 + + [SPARK-9646] [SQL] Add metrics for all join and aggregate operators + zsxwing + 2015-08-11 12:39:13 -0700 + Commit: 767ee18, github.com/apache/spark/pull/8060 + + [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python + Tathagata Das + 2015-08-11 12:02:28 -0700 + Commit: 71460b8, github.com/apache/spark/pull/8080 + + Fix comment error + Jeff Zhang + 2015-08-11 10:42:17 -0700 + Commit: b077f36, github.com/apache/spark/pull/8097 + + [SPARK-9785] [SQL] HashPartitioning compatibility should consider expression ordering + Josh Rosen + 2015-08-11 08:52:15 -0700 + Commit: efcae3a, github.com/apache/spark/pull/8074 + + [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform. + Reynold Xin + 2015-08-11 08:41:06 -0700 + Commit: 84ba990, github.com/apache/spark/pull/8094 + + [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent + Tathagata Das + 2015-08-11 02:41:03 -0700 + Commit: ebbd3b6, github.com/apache/spark/pull/8092 + + [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated + Tathagata Das + 2015-08-10 23:41:53 -0700 + Commit: c7f0090, github.com/apache/spark/pull/7961 + + [SPARK-9729] [SPARK-9363] [SQL] Use sort merge join for left and right outer join + Josh Rosen , Daoyuan Wang + 2015-08-10 22:04:41 -0700 + Commit: f9beef9, github.com/apache/spark/pull/5717. 
+ + [SPARK-9340] [SQL] Fixes converting unannotated Parquet lists + Cheng Lian + 2015-08-11 12:46:33 +0800 + Commit: 01efa4f, github.com/apache/spark/pull/8070 + + [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files. + Hao Zhu + 2015-08-10 17:17:22 -0700 + Commit: 94692bb, github.com/apache/spark/pull/8082 + + [SPARK-5155] [PYSPARK] [STREAMING] Mqtt streaming support in Python + Prabeesh K , zsxwing , prabs , Prabeesh K + 2015-08-10 16:33:23 -0700 + Commit: 8f4014f, github.com/apache/spark/pull/4229 + + [SPARK-9737] [YARN] Add the suggested configuration when required executor memory is above the max threshold of this cluster on YARN mode + Yadong Qi + 2015-08-09 19:54:05 +0100 + Commit: 51406be, github.com/apache/spark/pull/8028 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-10 14:26:56 -0700 + Commit: 0e4f58e + + Preparing Spark release v1.5.0-snapshot-20150810 + Patrick Wendell + 2015-08-10 14:26:49 -0700 + Commit: 3369ad9 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-10 13:56:56 -0700 + Commit: e51779c + + Preparing Spark release v1.5.0-snapshot-20150810 + Patrick Wendell + 2015-08-10 13:56:50 -0700 + Commit: 2203149 + + [SPARK-9759] [SQL] improve decimal.times() and cast(int, decimalType) + Davies Liu + 2015-08-10 13:55:11 -0700 + Commit: d17303a, github.com/apache/spark/pull/8052 + + [SPARK-9620] [SQL] generated UnsafeProjection should support many columns or large exressions + Davies Liu + 2015-08-10 13:52:18 -0700 + Commit: 2384248, github.com/apache/spark/pull/8044 + + [SPARK-9763][SQL] Minimize exposure of internal SQL classes. + Reynold Xin + 2015-08-10 13:49:23 -0700 + Commit: c1838e4, github.com/apache/spark/pull/8056 + + [SPARK-9784] [SQL] Exchange.isUnsafe should check whether codegen and unsafe are enabled + Josh Rosen + 2015-08-10 13:05:03 -0700 + Commit: d251d9f, github.com/apache/spark/pull/8073 + + Fixed AtmoicReference<> Example + Mahmoud Lababidi + 2015-08-10 13:02:01 -0700 + Commit: 39493b2, github.com/apache/spark/pull/8076 + + [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods + Feynman Liang + 2015-08-10 11:01:45 -0700 + Commit: 3ee2c8d, github.com/apache/spark/pull/8045 + + [SPARK-9743] [SQL] Fixes JSONRelation refreshing + Cheng Lian + 2015-08-10 09:07:08 -0700 + Commit: 94b2f5b, github.com/apache/spark/pull/8035 + + [SPARK-9777] [SQL] Window operator can accept UnsafeRows + Yin Huai + 2015-08-09 22:33:53 -0700 + Commit: f75c64b, github.com/apache/spark/pull/8064 + + [CORE] [SPARK-9760] Use Option instead of Some for Ivy repos + Shivaram Venkataraman + 2015-08-09 14:30:30 -0700 + Commit: 0e0471d, github.com/apache/spark/pull/8055 + + [SPARK-9703] [SQL] Refactor EnsureRequirements to avoid certain unnecessary shuffles + Josh Rosen + 2015-08-09 14:26:01 -0700 + Commit: 323d686, github.com/apache/spark/pull/7988 + + [SPARK-8930] [SQL] Throw a AnalysisException with meaningful messages if DataFrame#explode takes a star in expressions + Yijie Shen + 2015-08-09 11:44:51 -0700 + Commit: 1ce5061, github.com/apache/spark/pull/8057 + + [SPARK-9752][SQL] Support UnsafeRow in Sample operator. 
+ Reynold Xin + 2015-08-09 10:58:36 -0700 + Commit: b12f073, github.com/apache/spark/pull/8040 + + [SPARK-6212] [SQL] The EXPLAIN output of CTAS only shows the analyzed plan + Yijie Shen + 2015-08-08 21:05:50 -0700 + Commit: 251d1ee, github.com/apache/spark/pull/7986 + + [MINOR] inaccurate comments for showString() + CodingCat + 2015-08-08 18:22:46 -0700 + Commit: 874b9d8, github.com/apache/spark/pull/8050 + + [SPARK-9486][SQL] Add data source aliasing for external packages + Joseph Batchik , Joseph Batchik + 2015-08-08 11:03:01 -0700 + Commit: 06b6234, github.com/apache/spark/pull/7802 + + [SPARK-9728][SQL]Support CalendarIntervalType in HiveQL + Yijie Shen + 2015-08-08 11:01:25 -0700 + Commit: 3c438c7, github.com/apache/spark/pull/8034 + + [SPARK-6902] [SQL] [PYSPARK] Row should be read-only + Davies Liu + 2015-08-08 08:38:18 -0700 + Commit: 3427f57, github.com/apache/spark/pull/8009 + + [SPARK-4561] [PYSPARK] [SQL] turn Row into dict recursively + Davies Liu + 2015-08-08 08:36:14 -0700 + Commit: aaa475c, github.com/apache/spark/pull/8006 + + [SPARK-9738] [SQL] remove FromUnsafe and add its codegen version to GenerateSafe + Wenchen Fan + 2015-08-08 08:33:14 -0700 + Commit: 3ed219f, github.com/apache/spark/pull/8029 + + [SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for precision <= 18 rather than 8 + Cheng Lian + 2015-08-08 18:09:48 +0800 + Commit: 2cd9632, github.com/apache/spark/pull/8031 + + [SPARK-9731] Standalone scheduling incorrect cores if spark.executor.cores is not set + Carson Wang + 2015-08-07 23:36:26 -0700 + Commit: 2ad75d9, github.com/apache/spark/pull/8017 + + [SPARK-9753] [SQL] TungstenAggregate should also accept InternalRow instead of just UnsafeRow + Yin Huai + 2015-08-07 20:04:17 -0700 + Commit: 47e4735, github.com/apache/spark/pull/8041 + + [SPARK-9754][SQL] Remove TypeCheck in debug package. + Reynold Xin + 2015-08-07 19:09:28 -0700 + Commit: 5598b62, github.com/apache/spark/pull/8043 + + [SPARK-9719] [ML] Clean up Naive Bayes doc + Feynman Liang + 2015-08-07 17:21:12 -0700 + Commit: c5d43d6, github.com/apache/spark/pull/8047 + + [SPARK-9756] [ML] Make constructors in ML decision trees private + Feynman Liang + 2015-08-07 17:19:48 -0700 + Commit: 2a179a9, github.com/apache/spark/pull/8046 + + [SPARK-8890] [SQL] Fallback on sorting when writing many dynamic partitions + Michael Armbrust + 2015-08-07 16:24:50 -0700 + Commit: ea4dfb9, github.com/apache/spark/pull/8010 + + [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector + Dariusz Kobylarz + 2015-08-07 14:51:03 -0700 + Commit: 2952660, github.com/apache/spark/pull/8039 + + [SPARK-9674] Re-enable ignored test in SQLQuerySuite + Andrew Or + 2015-08-07 14:20:13 -0700 + Commit: 5471202, github.com/apache/spark/pull/8015 + + [SPARK-9733][SQL] Improve physical plan explain for data sources + Reynold Xin + 2015-08-07 13:41:45 -0700 + Commit: d13b5c8, github.com/apache/spark/pull/8024 + + [SPARK-9667][SQL] followup: Use GenerateUnsafeProjection.canSupport to test Exchange supported data types. + Reynold Xin + 2015-08-07 13:26:03 -0700 + Commit: 1b0f784, github.com/apache/spark/pull/8036 + + [SPARK-9736] [SQL] JoinedRow.anyNull should delegate to the underlying rows. 
+ Reynold Xin + 2015-08-07 11:29:13 -0700 + Commit: 70bf170, github.com/apache/spark/pull/8027 + + [SPARK-8382] [SQL] Improve Analysis Unit test framework + Wenchen Fan + 2015-08-07 11:28:43 -0700 + Commit: ff0abca, github.com/apache/spark/pull/8025 + + [SPARK-9674][SPARK-9667] Remove SparkSqlSerializer2 + Reynold Xin + 2015-08-07 11:02:53 -0700 + Commit: 6c2f30c, github.com/apache/spark/pull/7981 + + [SPARK-9467][SQL]Add SQLMetric to specialize accumulators to avoid boxing + zsxwing + 2015-08-07 00:09:58 -0700 + Commit: 7a6f950, github.com/apache/spark/pull/7996 + + [SPARK-9683] [SQL] copy UTF8String when convert unsafe array/map to safe + Wenchen Fan + 2015-08-07 00:00:43 -0700 + Commit: 064ba90, github.com/apache/spark/pull/7990 + + [SPARK-9453] [SQL] support records larger than page size in UnsafeShuffleExternalSorter + Davies Liu + 2015-08-06 23:40:38 -0700 + Commit: 8ece4cc, github.com/apache/spark/pull/8005 + + [SPARK-9700] Pick default page size more intelligently. + Reynold Xin + 2015-08-06 23:18:29 -0700 + Commit: 0e439c2, github.com/apache/spark/pull/8012 + + [SPARK-8862][SQL]Support multiple SQLContexts in Web UI + zsxwing + 2015-08-06 22:52:23 -0700 + Commit: c34fdaf, github.com/apache/spark/pull/7962 + + [SPARK-7550] [SQL] [MINOR] Fixes logs when persisting DataFrames + Cheng Lian + 2015-08-06 22:49:01 -0700 + Commit: aedc8f3, github.com/apache/spark/pull/8021 + + [SPARK-8057][Core]Call TaskAttemptContext.getTaskAttemptID using Reflection + zsxwing + 2015-08-06 21:42:42 -0700 + Commit: e902c4f, github.com/apache/spark/pull/6599 + + Fix doc typo + Jeff Zhang + 2015-08-06 21:03:47 -0700 + Commit: 5491dfb, github.com/apache/spark/pull/8019 + + [SPARK-9709] [SQL] Avoid starving unsafe operators that use sort + Andrew Or + 2015-08-06 19:04:57 -0700 + Commit: 472f0dc, github.com/apache/spark/pull/8011 + + [SPARK-9692] Remove SqlNewHadoopRDD's generated Tuple2 and InterruptibleIterator. + Reynold Xin + 2015-08-06 18:25:38 -0700 + Commit: 37b6403, github.com/apache/spark/pull/8000 + + [SPARK-9650][SQL] Fix quoting behavior on interpolated column names + Michael Armbrust + 2015-08-06 17:31:16 -0700 + Commit: 9be9d38, github.com/apache/spark/pull/7969 + + [SPARK-9228] [SQL] use tungsten.enabled in public for both of codegen/unsafe + Davies Liu + 2015-08-06 17:30:31 -0700 + Commit: b4feccf, github.com/apache/spark/pull/7998 + + [SPARK-9691] [SQL] PySpark SQL rand function treats seed 0 as no seed + Yin Huai + 2015-08-06 17:03:14 -0700 + Commit: 75b4e5a, github.com/apache/spark/pull/7999 + + [SPARK-9633] [BUILD] SBT download locations outdated; need an update + Sean Owen + 2015-08-06 23:43:52 +0100 + Commit: 985e454, github.com/apache/spark/pull/7956 + + [SPARK-9645] [YARN] [CORE] Allow shuffle service to read shuffle files. 
+ Marcelo Vanzin + 2015-08-06 15:30:27 -0700 + Commit: d0a648c, github.com/apache/spark/pull/7966 + + [SPARK-9630] [SQL] Clean up new aggregate operators (SPARK-9240 follow up) + Yin Huai + 2015-08-06 15:04:44 -0700 + Commit: 272e883, github.com/apache/spark/pull/7954 + + [SPARK-9639] [STREAMING] Fix a potential NPE in Streaming JobScheduler + zsxwing + 2015-08-06 14:39:36 -0700 + Commit: 9806872, github.com/apache/spark/pull/7960 + + [DOCS] [STREAMING] make the existing parameter docs for OffsetRange ac… + cody koeninger + 2015-08-06 14:37:25 -0700 + Commit: 8ecfb05, github.com/apache/spark/pull/7995 + + [SPARK-9556] [SPARK-9619] [SPARK-9624] [STREAMING] Make BlockGenerator more robust and make all BlockGenerators subscribe to rate limit updates + Tathagata Das + 2015-08-06 14:35:30 -0700 + Commit: 3997dd3, github.com/apache/spark/pull/7913 + + [SPARK-9548][SQL] Add a destructive iterator for BytesToBytesMap + Liang-Chi Hsieh , Reynold Xin + 2015-08-06 14:33:29 -0700 + Commit: 3137628, github.com/apache/spark/pull/7924. + + [SPARK-9211] [SQL] [TEST] normalize line separators before generating MD5 hash + Christian Kadner + 2015-08-06 14:15:42 -0700 + Commit: 990b4bf, github.com/apache/spark/pull/7563 + + [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression + Xiangrui Meng + 2015-08-06 13:29:31 -0700 + Commit: ee43d35, github.com/apache/spark/pull/7952 + + [SPARK-6923] [SPARK-7550] [SQL] Persists data source relations in Hive compatible format when possible + Cheng Lian , Cheng Hao + 2015-08-06 11:13:44 +0800 + Commit: 92e8acc, github.com/apache/spark/pull/7967 + + [SPARK-9381] [SQL] Migrate JSON data source to the new partitioning data source + Cheng Hao + 2015-08-05 22:35:55 +0800 + Commit: 3d24767, github.com/apache/spark/pull/7696 + + [SPARK-9618] [SQL] Use the specified schema when reading Parquet files + Nathan Howell + 2015-08-05 22:16:56 +0800 + Commit: d5f7881, github.com/apache/spark/pull/7947 + + [SPARK-8978] [STREAMING] Implements the DirectKafkaRateController + Dean Wampler , Nilanjan Raychaudhuri , François Garillot + 2015-08-06 12:50:08 -0700 + Commit: 8b00c06, github.com/apache/spark/pull/7796 + + [SPARK-9641] [DOCS] spark.shuffle.service.port is not documented + Sean Owen + 2015-08-06 19:29:42 +0100 + Commit: 8a79562, github.com/apache/spark/pull/7991 + + [SPARK-9632] [SQL] [HOT-FIX] Fix build. + Yin Huai + 2015-08-06 11:15:54 -0700 + Commit: b51159d, github.com/apache/spark/pull/8001 + + [SPARK-9632][SQL] update InternalRow.toSeq to make it accept data type info + Wenchen Fan + 2015-08-06 10:40:54 -0700 + Commit: 2382b48, github.com/apache/spark/pull/7955 + + [SPARK-9659][SQL] Rename inSet to isin to match Pandas function. 
+ Reynold Xin + 2015-08-06 10:39:16 -0700 + Commit: 6b8d2d7, github.com/apache/spark/pull/7977 + + [SPARK-9615] [SPARK-9616] [SQL] [MLLIB] Bugs related to FrequentItems when merging and with Tungsten + Burak Yavuz + 2015-08-06 10:29:40 -0700 + Commit: 78f168e, github.com/apache/spark/pull/7945 + + [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML + MechCoder + 2015-08-06 10:09:58 -0700 + Commit: e24b976, github.com/apache/spark/pull/7930 + + [SPARK-9112] [ML] Implement Stats for LogisticRegression + MechCoder + 2015-08-06 10:08:33 -0700 + Commit: 70b9ed1, github.com/apache/spark/pull/7538 + + [SPARK-9593] [SQL] [HOTFIX] Makes the Hadoop shims loading fix more robust + Cheng Lian + 2015-08-06 09:53:53 -0700 + Commit: cc4c569, github.com/apache/spark/pull/7994 + + [SPARK-9593] [SQL] Fixes Hadoop shims loading + Cheng Lian + 2015-08-05 20:03:54 +0800 + Commit: 11c28a5, github.com/apache/spark/pull/7929 + + [SPARK-9482] [SQL] Fix thread-safey issue of using UnsafeProjection in join + Davies Liu + 2015-08-06 09:12:41 -0700 + Commit: c39d5d1, github.com/apache/spark/pull/7940 + + [SPARK-9644] [SQL] Support update DecimalType with precision > 18 in UnsafeRow + Davies Liu + 2015-08-06 09:10:57 -0700 + Commit: 43b30bc, github.com/apache/spark/pull/7978 + + [SPARK-8266] [SQL] add function translate + zhichao.li + 2015-08-06 09:02:30 -0700 + Commit: cab86c4, github.com/apache/spark/pull/7709 + + [SPARK-9664] [SQL] Remove UDAFRegistration and add apply to UserDefinedAggregateFunction. + Yin Huai + 2015-08-05 21:50:35 -0700 + Commit: 29ace3b, github.com/apache/spark/pull/7982 + + [SPARK-9674][SQL] Remove GeneratedAggregate. + Reynold Xin + 2015-08-05 21:50:14 -0700 + Commit: 252eb61, github.com/apache/spark/pull/7983 + + [SPARK-9611] [SQL] Fixes a few corner cases when we spill a UnsafeFixedWidthAggregationMap + Yin Huai + 2015-08-05 19:19:09 -0700 + Commit: f24cd8c, github.com/apache/spark/pull/7948 + + [SPARK-9651] Fix UnsafeExternalSorterSuite. + Marcelo Vanzin + 2015-08-05 17:58:36 -0700 + Commit: eb2229a, github.com/apache/spark/pull/7970 + + [SPARK-6591] [SQL] Python data source load options should auto convert common types into strings + Yijie Shen + 2015-08-05 17:28:23 -0700 + Commit: 5f037b3, github.com/apache/spark/pull/7926 + + [SPARK-5895] [ML] Add VectorSlicer - updated + Xusen Yin , Joseph K. 
Bradley + 2015-08-05 17:07:55 -0700 + Commit: 3b617e8, github.com/apache/spark/pull/7972 + + [SPARK-9054] [SQL] Rename RowOrdering to InterpretedOrdering; use newOrdering in SMJ + Josh Rosen + 2015-08-05 16:33:42 -0700 + Commit: 618dc63, github.com/apache/spark/pull/7973 + + [SPARK-9657] Fix return type of getMaxPatternLength + Feynman Liang + 2015-08-05 15:42:18 -0700 + Commit: 30e9fcf, github.com/apache/spark/pull/7974 + + [SPARK-9649] Fix flaky test MasterSuite - randomize ports + Andrew Or + 2015-08-05 14:12:22 -0700 + Commit: 05cbf13, github.com/apache/spark/pull/7968 + + [SPARK-9403] [SQL] Add codegen support in In and InSet + Liang-Chi Hsieh , Tarek Auel + 2015-08-05 11:38:56 -0700 + Commit: b8136d7, github.com/apache/spark/pull/7893 + + [SPARK-9141] [SQL] [MINOR] Fix comments of PR #7920 + Yin Huai + 2015-08-05 11:03:02 -0700 + Commit: 19018d5, github.com/apache/spark/pull/7964 + + [SPARK-9519] [YARN] Confirm stop sc successfully when application was killed + linweizhong + 2015-08-05 10:16:12 -0700 + Commit: 03bcf62, github.com/apache/spark/pull/7846 + + [SPARK-9141] [SQL] Remove project collapsing from DataFrame API + Michael Armbrust + 2015-08-05 09:01:45 -0700 + Commit: 125827a, github.com/apache/spark/pull/7920 + + [SPARK-6486] [MLLIB] [PYTHON] Add BlockMatrix to PySpark. + Mike Dusenberry + 2015-08-05 07:40:50 -0700 + Commit: eedb996, github.com/apache/spark/pull/7761 + + [SPARK-9607] [SPARK-9608] fix zinc-port handling in build/mvn + Ryan Williams + 2015-08-05 11:10:47 +0100 + Commit: 3500064, github.com/apache/spark/pull/7944 + + [HOTFIX] Add static import to fix build break from #7676. + Josh Rosen + 2015-08-05 02:39:41 -0700 + Commit: 93c166a + + [SPARK-9628][SQL]Rename int to SQLDate, long to SQLTimestamp for better readability + Yijie Shen + 2015-08-05 02:04:28 -0700 + Commit: f288cca, github.com/apache/spark/pull/7953 + + [SPARK-8861][SPARK-8862][SQL] Add basic instrumentation to each SparkPlan operator and add a new SQL tab + zsxwing + 2015-08-05 01:51:22 -0700 + Commit: ebc3aad, github.com/apache/spark/pull/7774 + + [SPARK-9601] [DOCS] Fix JavaPairDStream signature for stream-stream and windowed join in streaming guide doc + Namit Katariya + 2015-08-05 01:07:33 -0700 + Commit: 6306019, github.com/apache/spark/pull/7935 + + [SPARK-9360] [SQL] Support BinaryType in PrefixComparators for UnsafeExternalSort + Takeshi YAMAMURO + 2015-08-05 00:54:31 -0700 + Commit: 7fa4195, github.com/apache/spark/pull/7676 + + [SPARK-9581][SQL] Add unit test for JSON UDT + Emiliano Leporati , Reynold Xin + 2015-08-05 00:42:08 -0700 + Commit: 57596fb, github.com/apache/spark/pull/7917 + + [SPARK-9217] [STREAMING] Make the kinesis receiver reliable by recording sequence numbers + Tathagata Das + 2015-08-05 00:20:26 -0700 + Commit: ea23e54, github.com/apache/spark/pull/7825 + + Update docs/README.md to put all prereqs together. 
+ Reynold Xin + 2015-08-04 22:17:14 -0700 + Commit: b6e8446, github.com/apache/spark/pull/7951 + + Add a prerequisites section for building docs + Shivaram Venkataraman + 2015-08-03 17:00:59 -0700 + Commit: 141f034, github.com/apache/spark/pull/7912 + + [SPARK-9119] [SPARK-8359] [SQL] match Decimal.precision/scale with DecimalType + Davies Liu + 2015-08-04 23:12:49 -0700 + Commit: 864d5de, github.com/apache/spark/pull/7925 + + [SPARK-8231] [SQL] Add array_contains + Pedro Rodriguez , Pedro Rodriguez , Davies Liu + 2015-08-04 22:32:21 -0700 + Commit: 28bb977, github.com/apache/spark/pull/7580 + + [SPARK-9540] [MLLIB] optimize PrefixSpan implementation + Xiangrui Meng + 2015-08-04 22:28:49 -0700 + Commit: bca1967, github.com/apache/spark/pull/7594 + + [SPARK-9504] [STREAMING] [TESTS] Fix o.a.s.streaming.StreamingContextSuite.stop gracefully again + zsxwing + 2015-08-04 20:09:15 -0700 + Commit: 6e72d24, github.com/apache/spark/pull/7934 + + [SPARK-9513] [SQL] [PySpark] Add python API for DataFrame functions + Davies Liu + 2015-08-04 19:25:24 -0700 + Commit: d196d36, github.com/apache/spark/pull/7922 + + [SPARK-7119] [SQL] Give script a default serde with the user specific types + zhichao.li + 2015-08-04 18:26:05 -0700 + Commit: f957c59, github.com/apache/spark/pull/6638 + + [SPARK-8313] R Spark packages support + Burak Yavuz + 2015-08-04 18:20:12 -0700 + Commit: 11d2311, github.com/apache/spark/pull/7139 + + [SPARK-9432][SQL] Audit expression unit tests to make sure we pass the proper numeric ranges + Yijie Shen + 2015-08-04 18:19:26 -0700 + Commit: 02a6333, github.com/apache/spark/pull/7933 + + [SPARK-8601] [ML] Add an option to disable standardization for linear regression + Holden Karau , DB Tsai + 2015-08-04 18:15:26 -0700 + Commit: 2237ddb, github.com/apache/spark/pull/7875 + + [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy + Feynman Liang + 2015-08-04 18:13:18 -0700 + Commit: 3350975, github.com/apache/spark/pull/7941 + + [SPARK-9598][SQL] do not expose generic getter in internal row + Wenchen Fan + 2015-08-04 17:05:19 -0700 + Commit: 1954a7b, github.com/apache/spark/pull/7932 + + [SPARK-9586] [ML] Update BinaryClassificationEvaluator to use setRawPredictionCol + Joseph K. Bradley + 2015-08-04 16:52:43 -0700 + Commit: cff0fe2, github.com/apache/spark/pull/7921 + + [SPARK-6485] [MLLIB] [PYTHON] Add CoordinateMatrix/RowMatrix/IndexedRowMatrix to PySpark. + Mike Dusenberry + 2015-08-04 16:30:03 -0700 + Commit: f4e125a, github.com/apache/spark/pull/7554 + + [SPARK-9582] [ML] LDA cleanups + Joseph K. Bradley + 2015-08-04 15:43:13 -0700 + Commit: fe4a4f4, github.com/apache/spark/pull/7916 + + [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier + Joseph K. 
Bradley + 2015-08-04 14:54:26 -0700 + Commit: e682ee2, github.com/apache/spark/pull/7903 + + [SPARK-9602] remove "Akka/Actor" words from comments + CodingCat + 2015-08-04 14:54:11 -0700 + Commit: 560b2da, github.com/apache/spark/pull/7936 + + [SPARK-9452] [SQL] Support records larger than page size in UnsafeExternalSorter + Josh Rosen + 2015-08-04 14:42:11 -0700 + Commit: f771a83, github.com/apache/spark/pull/7891 + + [SPARK-9553][SQL] remove the no-longer-necessary createCode and createStructCode, and replace the usage + Wenchen Fan + 2015-08-04 14:40:46 -0700 + Commit: 43f6b02, github.com/apache/spark/pull/7890 + + [SPARK-9606] [SQL] Ignore flaky thrift server tests + Michael Armbrust + 2015-08-04 12:19:52 -0700 + Commit: be37b1b, github.com/apache/spark/pull/7939 + + [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier + Holden Karau , Joseph K. Bradley + 2015-08-04 10:12:22 -0700 + Commit: c5250dd, github.com/apache/spark/pull/7909 + + [SPARK-9512][SQL] Revert SPARK-9251, Allow evaluation while sorting + Michael Armbrust + 2015-08-04 10:07:53 -0700 + Commit: a9277cd, github.com/apache/spark/pull/7906 + + [SPARK-9562] Change reference to amplab/spark-ec2 from mesos/ + Shivaram Venkataraman + 2015-08-04 09:40:07 -0700 + Commit: aa8390d, github.com/apache/spark/pull/7899 + + [SPARK-9541] [SQL] DataTimeUtils cleanup + Yijie Shen + 2015-08-04 09:09:52 -0700 + Commit: d875368, github.com/apache/spark/pull/7870 + + [SPARK-8246] [SQL] Implement get_json_object + Davies Liu , Yin Huai , Nathan Howell + 2015-08-04 09:07:09 -0700 + Commit: b42e13d, github.com/apache/spark/pull/7901 + + [SPARK-8244] [SQL] string function: find in set + Tarek Auel , Davies Liu + 2015-08-04 08:59:42 -0700 + Commit: 945da35, github.com/apache/spark/pull/7186 + + [SPARK-9583] [BUILD] Do not print mvn debug messages to stdout. + Marcelo Vanzin + 2015-08-04 22:19:11 +0900 + Commit: f44b27a, github.com/apache/spark/pull/7915 + + [SPARK-2016] [WEBUI] RDD partition table pagination for the RDD Page + Carson Wang + 2015-08-04 22:12:30 +0900 + Commit: 45c8d2b, github.com/apache/spark/pull/7692 + + [SPARK-8064] [BUILD] Follow-up. Undo change from SPARK-9507 that was accidentally reverted + tedyu + 2015-08-04 12:22:53 +0100 + Commit: bd9b752, github.com/apache/spark/pull/7919 + + [SPARK-9534] [BUILD] Enable javac lint for scalac parity; fix a lot of build warnings, 1.5.0 edition + Sean Owen + 2015-08-04 12:02:26 +0100 + Commit: 5ae6753, github.com/apache/spark/pull/7862 + + [SPARK-3190] [GRAPHX] Fix VertexRDD.count() overflow regression + Ankur Dave + 2015-08-03 23:07:32 -0700 + Commit: 29f2d5a, github.com/apache/spark/pull/7923 + + [SPARK-9521] [DOCS] Addendum. Require Maven 3.3.3+ in the build + Sean Owen + 2015-08-04 13:48:22 +0900 + Commit: 1f7dbcd, github.com/apache/spark/pull/7905 + + [SPARK-9577][SQL] Surface concrete iterator types in various sort classes. 
+ Reynold Xin + 2015-08-03 18:47:02 -0700 + Commit: ebe42b9, github.com/apache/spark/pull/7911 + + [SPARK-8416] highlight and topping the executor threads in thread dumping page + CodingCat + 2015-08-03 18:20:40 -0700 + Commit: 93076ae, github.com/apache/spark/pull/7808 + + [SPARK-9263] Added flags to exclude dependencies when using --packages + Burak Yavuz + 2015-08-03 17:42:03 -0700 + Commit: 3433571, github.com/apache/spark/pull/7599 + + [SPARK-9483] Fix UTF8String.getPrefix for big-endian. + Matthew Brandyberry + 2015-08-03 17:36:56 -0700 + Commit: 73c863a, github.com/apache/spark/pull/7902 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-03 16:59:19 -0700 + Commit: 74792e7 + + Preparing Spark release v1.5.0-snapshot-20150803 + Patrick Wendell + 2015-08-03 16:59:13 -0700 + Commit: 7e7147f + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-03 16:54:56 -0700 + Commit: bc49ca4 + + Preparing Spark release v1.5.0-snapshot-20150803 + Patrick Wendell + 2015-08-03 16:54:50 -0700 + Commit: 4c4f638 + + [SPARK-8874] [ML] Add missing methods in Word2Vec + MechCoder + 2015-08-03 16:44:25 -0700 + Commit: acda9d9, github.com/apache/spark/pull/7263 + + Preparing development version 1.5.0-SNAPSHOT + Patrick Wendell + 2015-08-03 16:37:34 -0700 + Commit: 73fab88 + + Preparing Spark release v1.5.0-snapshot-20150803 + Patrick Wendell + 2015-08-03 16:37:27 -0700 + Commit: 3526420 + + [SPARK-8064] [SQL] Build against Hive 1.2.1 + Steve Loughran , Cheng Lian , Michael Armbrust , Patrick Wendell + 2015-08-03 15:24:34 -0700 + Commit: 6bd12e8, github.com/apache/spark/pull/7191 + + Revert "[SPARK-9372] [SQL] Filter nulls in join keys" + Reynold Xin + 2015-08-03 14:51:36 -0700 + Commit: db58327 + + [SPARK-8735] [SQL] Expose memory usage for shuffles, joins and aggregations + Andrew Or + 2015-08-03 14:22:07 -0700 + Commit: 29756ff, github.com/apache/spark/pull/7770 + + [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples + Yanbo Liang + 2015-08-03 13:58:00 -0700 + Commit: e7329ab, github.com/apache/spark/pull/7522 + + [SPARK-9544] [MLLIB] add Python API for RFormula + Xiangrui Meng + 2015-08-03 13:59:35 -0700 + Commit: dc0c8c9, github.com/apache/spark/pull/7879 + + [SPARK-9558][DOCS]Update docs to follow the increase of memory defaults. + Kousuke Saruta + 2015-08-03 12:53:44 -0700 + Commit: 444058d, github.com/apache/spark/pull/7896 + + [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor + Joseph K. Bradley , Feynman Liang + 2015-08-03 12:17:46 -0700 + Commit: b3117d3, github.com/apache/spark/pull/7838 + + [SPARK-9554] [SQL] Enables in-memory partition pruning by default + Cheng Lian + 2015-08-03 12:06:58 -0700 + Commit: 6d46e9b, github.com/apache/spark/pull/7895 + + [SQL][minor] Simplify UnsafeRow.calculateBitSetWidthInBytes. + Reynold Xin + 2015-08-03 11:22:02 -0700 + Commit: 5452e93, github.com/apache/spark/pull/7897 + + [SPARK-9511] [SQL] Fixed Table Name Parsing + Joseph Batchik + 2015-08-03 11:17:38 -0700 + Commit: 4de833e, github.com/apache/spark/pull/7844 + + [SPARK-1855] Local checkpointing + Andrew Or + 2015-08-03 10:58:37 -0700 + Commit: b41a327, github.com/apache/spark/pull/7279 + + [SPARK-9528] [ML] Changed RandomForestClassifier to extend ProbabilisticClassifier + Joseph K. 
Bradley + 2015-08-03 10:46:34 -0700 + Commit: 69f5a7c, github.com/apache/spark/pull/7859 + + Two minor comments from code review on 191bf2689. + Reynold Xin + 2015-08-03 04:26:18 -0700 + Commit: 8be198c + + [SPARK-9518] [SQL] cleanup generated UnsafeRowJoiner and fix bug + Davies Liu + 2015-08-03 04:23:26 -0700 + Commit: 191bf26, github.com/apache/spark/pull/7892 + + [SPARK-9551][SQL] add a cheap version of copy for UnsafeRow to reuse a copy buffer + Wenchen Fan + 2015-08-03 04:21:15 -0700 + Commit: 137f478, github.com/apache/spark/pull/7885 + + [SPARK-8873] [MESOS] Clean up shuffle files if external shuffle service is used + Timothy Chen , Andrew Or + 2015-08-03 01:55:58 -0700 + Commit: 95dccc6, github.com/apache/spark/pull/7881 + + [SPARK-9240] [SQL] Hybrid aggregate operator using unsafe row + Yin Huai + 2015-08-03 00:23:08 -0700 + Commit: 1ebd41b, github.com/apache/spark/pull/7813 + + [SPARK-9549][SQL] fix bugs in expressions + Yijie Shen + 2015-08-03 00:15:24 -0700 + Commit: 98d6d9c, github.com/apache/spark/pull/7882 + + [SPARK-9404][SPARK-9542][SQL] unsafe array data and map data + Wenchen Fan + 2015-08-02 23:41:16 -0700 + Commit: 608353c, github.com/apache/spark/pull/7752 + + [SPARK-9372] [SQL] Filter nulls in join keys + Yin Huai , Josh Rosen + 2015-08-02 23:32:09 -0700 + Commit: 687c8c3, github.com/apache/spark/pull/7768 + + [SPARK-9536] [SPARK-9537] [SPARK-9538] [ML] [PYSPARK] ml.classification support raw and probability prediction for PySpark + Yanbo Liang + 2015-08-02 22:19:27 -0700 + Commit: 4cdd8ec, github.com/apache/spark/pull/7866 + + [SPARK-2205] [SQL] Avoid unnecessary exchange operators in multi-way joins + Yin Huai , Josh Rosen + 2015-08-02 20:44:23 -0700 + Commit: 114ff92, github.com/apache/spark/pull/7773 + + [SPARK-9546][SQL] Centralize orderable data type checking. + Reynold Xin + 2015-08-02 20:12:03 -0700 + Commit: 30e8911, github.com/apache/spark/pull/7880 + + [SPARK-9535][SQL][DOCS] Modify document for codegen. + KaiXinXiaoLei , Kousuke Saruta + 2015-08-02 20:04:21 -0700 + Commit: 536d2ad, github.com/apache/spark/pull/7142 + + [SPARK-9543][SQL] Add randomized testing for UnsafeKVExternalSorter. + Reynold Xin + 2015-08-02 17:54:30 -0700 + Commit: 9d03ad9, github.com/apache/spark/pull/7873 + + [SPARK-7937][SQL] Support comparison on StructType + Liang-Chi Hsieh , Liang-Chi Hsieh , Reynold Xin + 2015-08-02 17:53:44 -0700 + Commit: 0722f43, github.com/apache/spark/pull/6519. + + [SPARK-9531] [SQL] UnsafeFixedWidthAggregationMap.destructAndCreateExternalSorter + Reynold Xin , Josh Rosen + 2015-08-02 12:32:14 -0700 + Commit: 2e981b7, github.com/apache/spark/pull/7860 + + [SPARK-9527] [MLLIB] add PrefixSpanModel and make PrefixSpan Java friendly + Xiangrui Meng + 2015-08-02 11:50:17 -0700 + Commit: 66924ff, github.com/apache/spark/pull/7869 + + [SPARK-9208][SQL] Sort DataFrame functions alphabetically. 
+ Reynold Xin + 2015-08-02 11:36:11 -0700 + Commit: 8eafa2a, github.com/apache/spark/pull/7861 + + [SPARK-9149] [ML] [EXAMPLES] Add an example of spark.ml KMeans + Yu ISHIKAWA + 2015-08-02 09:00:32 +0100 + Commit: 244016a, github.com/apache/spark/pull/7697 + + [SPARK-9521] [BUILD] Require Maven 3.3.3+ in the build + Sean Owen + 2015-08-02 08:56:35 +0100 + Commit: 9d1c025, github.com/apache/spark/pull/7852 + + [SPARK-9529] [SQL] improve TungstenSort on DecimalType + Davies Liu + 2015-08-01 23:36:06 -0700 + Commit: 16b928c, github.com/apache/spark/pull/7857 + + [SPARK-9000] [MLLIB] Support generic item types in PrefixSpan + Feynman Liang , masaki rikitoku + 2015-08-01 23:11:25 -0700 + Commit: 28d944e, github.com/apache/spark/pull/7400 + + [SPARK-9459] [SQL] use generated FromUnsafeProjection to do deep copy for UTF8String and struct + Davies Liu + 2015-08-01 21:50:42 -0700 + Commit: 57084e0, github.com/apache/spark/pull/7840 + + [SPARK-8185] [SPARK-8188] [SPARK-8191] [SQL] function datediff, to_utc_timestamp, from_utc_timestamp + Davies Liu , Daoyuan Wang + 2015-08-01 21:46:46 -0700 + Commit: c1b0cbd, github.com/apache/spark/pull/7847 + + [SPARK-8269] [SQL] string function: initcap + HuJiayin , Davies Liu + 2015-08-01 21:44:57 -0700 + Commit: 00cd92f, github.com/apache/spark/pull/7208 + + [SPARK-9495] prefix of DateType/TimestampType + Davies Liu + 2015-08-01 18:22:46 -0700 + Commit: 5d9e33d, github.com/apache/spark/pull/7856 + + [SPARK-9530] [MLLIB] ScalaDoc should not indicate LDAModel.describeTopics and DistributedLDAModel.topDocumentsPerTopic as approximate + Meihua Wu + 2015-08-01 17:13:28 -0700 + Commit: 84a6982, github.com/apache/spark/pull/7858 + + [SPARK-9520] [SQL] Support in-place sort in UnsafeFixedWidthAggregationMap + Reynold Xin + 2015-08-01 13:20:26 -0700 + Commit: 3d1535d, github.com/apache/spark/pull/7849 + + [SPARK-9491] Avoid fetching HBase tokens when not needed. 
+ Marcelo Vanzin + 2015-08-01 13:06:50 -0700 + Commit: df733cb, github.com/apache/spark/pull/7810 + + [SPARK-4751] Dynamic allocation in standalone mode + Andrew Or + 2015-08-01 11:57:14 -0700 + Commit: 6688ba6, github.com/apache/spark/pull/7532 + + [SPARK-8263] [SQL] substr/substring should also support binary type + zhichao.li , Davies Liu + 2015-08-01 08:48:46 -0700 + Commit: c5166f7, github.com/apache/spark/pull/7641 + + [SPARK-8232] [SQL] Add sort_array support + Cheng Hao , Davies Liu + 2015-08-01 08:32:29 -0700 + Commit: cf6c9ca, github.com/apache/spark/pull/7851 + + [SPARK-8169] [ML] Add StopWordsRemover as a transformer + Yuhao Yang + 2015-08-01 02:31:28 -0700 + Commit: 8765665, github.com/apache/spark/pull/6742 + + [SPARK-8999] [MLLIB] PrefixSpan non-temporal sequences + zhangjiajin , Feynman Liang , zhang jiajin + 2015-08-01 01:56:27 -0700 + Commit: d2a9b66, github.com/apache/spark/pull/7646 + + [SPARK-7446] [MLLIB] Add inverse transform for string indexer + Holden Karau + 2015-08-01 01:09:38 -0700 + Commit: 6503897, github.com/apache/spark/pull/6339 + + Revert "[SPARK-8232] [SQL] Add sort_array support" + Davies Liu + 2015-08-01 00:41:15 -0700 + Commit: 60ea7ab + + [SPARK-9480][SQL] add MapData and cleanup internal row stuff + Wenchen Fan + 2015-08-01 00:17:15 -0700 + Commit: 1d59a41, github.com/apache/spark/pull/7799 + + [SPARK-9517][SQL] BytesToBytesMap should encode data the same way as UnsafeExternalSorter + Reynold Xin + 2015-07-31 23:55:16 -0700 + Commit: d90f2cf, github.com/apache/spark/pull/7845 + + [SPARK-8232] [SQL] Add sort_array support + Cheng Hao + 2015-07-31 23:11:22 -0700 + Commit: 67ad4e2, github.com/apache/spark/pull/7581 + + [SPARK-9415][SQL] Throw AnalysisException when using MapType on Join and Aggregate + Liang-Chi Hsieh + 2015-07-31 22:26:30 -0700 + Commit: 3320b0b, github.com/apache/spark/pull/7819 + + [SPARK-9464][SQL] Property checks for UTF8String + Josh Rosen , Yijie Shen + 2015-07-31 21:19:23 -0700 + Commit: 14f2634, github.com/apache/spark/pull/7830 + + [SPARK-8264][SQL]add substring_index function + zhichao.li , Davies Liu + 2015-07-31 21:18:01 -0700 + Commit: 6996bd2, github.com/apache/spark/pull/7533 + + [SPARK-9358][SQL] Code generation for UnsafeRow joiner. 
+ Reynold Xin + 2015-07-31 21:09:00 -0700 + Commit: 03377d2, github.com/apache/spark/pull/7821 + + [SPARK-9318] [SPARK-9320] [SPARKR] Aliases for merge and summary functions on DataFrames + Hossein + 2015-07-31 19:24:00 -0700 + Commit: 712f5b7, github.com/apache/spark/pull/7806 + + [SPARK-9451] [SQL] Support entries larger than default page size in BytesToBytesMap & integrate with ShuffleMemoryManager + Josh Rosen + 2015-07-31 19:19:27 -0700 + Commit: 8cb415a, github.com/apache/spark/pull/7762 + + [SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter optimization + Feynman Liang + 2015-07-31 18:36:22 -0700 + Commit: f51fd6f, github.com/apache/spark/pull/7836 + + [SPARK-8271][SQL]string function: soundex + HuJiayin , Davies Liu + 2015-07-31 16:05:26 -0700 + Commit: 4d5a6e7, github.com/apache/spark/pull/7812 + + [SPARK-9233] [SQL] Enable code-gen in window function unit tests + Yin Huai + 2015-07-31 14:13:06 -0700 + Commit: 3fc0cb9, github.com/apache/spark/pull/7832 + + [SPARK-9324] [SPARK-9322] [SPARK-9321] [SPARKR] Some aliases for R-like functions in DataFrames + Hossein + 2015-07-31 14:07:41 -0700 + Commit: 710c2b5, github.com/apache/spark/pull/7764 + + [SPARK-9510] [SPARKR] Remaining SparkR style fixes + Shivaram Venkataraman + 2015-07-31 14:02:44 -0700 + Commit: 82f47b8, github.com/apache/spark/pull/7834 + + [SPARK-9507] [BUILD] Remove dependency reduced POM hack now that shade plugin is updated + Sean Owen + 2015-07-31 21:51:55 +0100 + Commit: 6e5fd61, github.com/apache/spark/pull/7826 + + [SPARK-9490] [DOCS] [MLLIB] MLlib evaluation metrics guide example python code uses deprecated print statement + Sean Owen + 2015-07-31 13:45:28 -0700 + Commit: 873ab0f, github.com/apache/spark/pull/7822 + + [SPARK-9466] [SQL] Increate two timeouts in CliSuite. 
+ Yin Huai + 2015-07-31 13:45:12 -0700 + Commit: 815c824, github.com/apache/spark/pull/7777 + + [SPARK-9308] [ML] ml.NaiveBayesModel support predicting class probabilities + Yanbo Liang + 2015-07-31 13:11:42 -0700 + Commit: fbef566, github.com/apache/spark/pull/7672 + + [SPARK-9056] [STREAMING] Rename configuration `spark.streaming.minRememberDuration` to `spark.streaming.fileStream.minRememberDuration` + Sameer Abhyankar , Sameer Abhyankar + 2015-07-31 13:08:55 -0700 + Commit: 060c79a, github.com/apache/spark/pull/7740 + + [SPARK-9246] [MLLIB] DistributedLDAModel predict top docs per topic + Meihua Wu + 2015-07-31 13:01:10 -0700 + Commit: 3c0d2e5, github.com/apache/spark/pull/7769 + + [SPARK-9202] capping maximum number of executor&driver information kept in Worker + CodingCat + 2015-07-31 20:27:00 +0100 + Commit: c068666, github.com/apache/spark/pull/7714 + + [SPARK-9481] Add logLikelihood to LocalLDAModel + Feynman Liang + 2015-07-31 12:12:22 -0700 + Commit: a8340fa, github.com/apache/spark/pull/7801 + + [SPARK-9504] [STREAMING] [TESTS] Use eventually to fix the flaky test + zsxwing + 2015-07-31 12:10:55 -0700 + Commit: d046347, github.com/apache/spark/pull/7823 + + [SPARK-8564] [STREAMING] Add the Python API for Kinesis + zsxwing + 2015-07-31 12:09:48 -0700 + Commit: 3afc1de, github.com/apache/spark/pull/6955 + + [SPARK-8640] [SQL] Enable Processing of Multiple Window Frames in a Single Window Operator + Herman van Hovell + 2015-07-31 12:07:18 -0700 + Commit: 39ab199, github.com/apache/spark/pull/7515 + + [SPARK-8979] Add a PID based rate estimator + Iulian Dragos , François Garillot + 2015-07-31 12:04:03 -0700 + Commit: 0a1d2ca, github.com/apache/spark/pull/7648 + + [SPARK-6885] [ML] decision tree support predict class probabilities + Yanbo Liang + 2015-07-31 11:56:52 -0700 + Commit: e8bdcde, github.com/apache/spark/pull/7694 + + [SPARK-9231] [MLLIB] DistributedLDAModel method for top topics per document + Yuhao Yang + 2015-07-31 11:50:15 -0700 + Commit: 4011a94, github.com/apache/spark/pull/7785 + + [SPARK-9471] [ML] Multilayer Perceptron + Alexander Ulanov , Bert Greevenbosch + 2015-07-31 11:22:40 -0700 + Commit: 6add4ed, github.com/apache/spark/pull/7621 + + [SQL] address comments for to_date/trunc + Davies Liu + 2015-07-31 11:07:34 -0700 + Commit: 0024da9, github.com/apache/spark/pull/7817 + + [SPARK-9446] Clear Active SparkContext in stop() method + tedyu + 2015-07-31 18:16:55 +0100 + Commit: 27ae851, github.com/apache/spark/pull/7756 + + [SPARK-9497] [SPARK-9509] [CORE] Use ask instead of askWithRetry + zsxwing + 2015-07-31 09:34:10 -0700 + Commit: 04a49ed, github.com/apache/spark/pull/7824 + + [SPARK-9053] [SPARKR] Fix spaces around parens, infix operators etc. 
+ Yu ISHIKAWA + 2015-07-31 09:33:38 -0700 + Commit: fc0e57e, github.com/apache/spark/pull/7584 + + [SPARK-9500] add TernaryExpression to simplify ternary expressions + Davies Liu + 2015-07-31 08:28:05 -0700 + Commit: 6bba750, github.com/apache/spark/pull/7816 + + [SPARK-9496][SQL]do not print the password in config + WangTaoTheTonic + 2015-07-30 23:50:06 -0700 + Commit: a3a85d7, github.com/apache/spark/pull/7815 + + [SPARK-9152][SQL] Implement code generation for Like and RLike + Liang-Chi Hsieh + 2015-07-30 23:05:58 -0700 + Commit: 0244170, github.com/apache/spark/pull/7561 + + [SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for Python + Yanbo Liang + 2015-07-30 23:03:48 -0700 + Commit: 69b62f7, github.com/apache/spark/pull/7568 + + [SPARK-7690] [ML] Multiclass classification Evaluator + Ram Sriharsha + 2015-07-30 23:02:11 -0700 + Commit: 4e5919b, github.com/apache/spark/pull/7475 + + [SPARK-8176] [SPARK-8197] [SQL] function to_date/ trunc + Daoyuan Wang , Davies Liu + 2015-07-30 19:22:38 -0700 + Commit: 83670fc, github.com/apache/spark/pull/6988 + + [SPARK-9472] [STREAMING] consistent hadoop configuration, streaming only + cody koeninger + 2015-07-30 17:44:20 -0700 + Commit: 9307f56, github.com/apache/spark/pull/7772 + + [SPARK-9489] Remove unnecessary compatibility and requirements checks from Exchange + Josh Rosen + 2015-07-30 17:38:48 -0700 + Commit: 3c66ff7, github.com/apache/spark/pull/7807 + + [SPARK-9077] [MLLIB] Improve error message for decision trees when numExamples < maxCategoriesPerFeature + Sean Owen + 2015-07-30 17:26:18 -0700 + Commit: 65fa418, github.com/apache/spark/pull/7800 + + [SPARK-6319][SQL] Throw AnalysisException when using BinaryType on Join and Aggregate + Liang-Chi Hsieh + 2015-07-30 17:22:51 -0700 + Commit: 351eda0, github.com/apache/spark/pull/7787 + + [SPARK-9425] [SQL] support DecimalType in UnsafeRow + Davies Liu + 2015-07-30 17:18:32 -0700 + Commit: 0b1a464, github.com/apache/spark/pull/7758 + + [SPARK-9458][SPARK-9469][SQL] Code generate prefix computation in sorting & moves unsafe conversion out of TungstenSort. + Reynold Xin + 2015-07-30 17:17:27 -0700 + Commit: e7a0976, github.com/apache/spark/pull/7803 + + [SPARK-7157][SQL] add sampleBy to DataFrame + Xiangrui Meng + 2015-07-30 17:16:03 -0700 + Commit: df32669, github.com/apache/spark/pull/7755 + + [SPARK-9408] [PYSPARK] [MLLIB] Refactor linalg.py to /linalg + Xiangrui Meng + 2015-07-30 16:57:38 -0700 + Commit: ca71cc8, github.com/apache/spark/pull/7731 + + [STREAMING] [TEST] [HOTFIX] Fixed Kinesis test to not throw weird errors when Kinesis tests are enabled without AWS keys + Tathagata Das + 2015-07-30 16:44:02 -0700 + Commit: 1afdeb7, github.com/apache/spark/pull/7809 + + [SPARK-9199] [CORE] Update Tachyon dependency from 0.6.4 -> 0.7.0 + Calvin Jia + 2015-07-30 16:32:40 -0700 + Commit: 04c8409, github.com/apache/spark/pull/7577 + + [SPARK-8742] [SPARKR] Improve SparkR error messages for DataFrame API + Hossein + 2015-07-30 16:16:17 -0700 + Commit: 157840d, github.com/apache/spark/pull/7742 + + [SPARK-9463] [ML] Expose model coefficients with names in SparkR RFormula + Eric Liang + 2015-07-30 16:15:43 -0700 + Commit: e7905a9, github.com/apache/spark/pull/7771 + + [SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs + Joseph K. 
Bradley + 2015-07-30 16:04:23 -0700 + Commit: be7be6d, github.com/apache/spark/pull/7804 + + [SPARK-8671] [ML] Added isotonic regression to the pipeline API. + martinzapletal + 2015-07-30 15:57:14 -0700 + Commit: 7f7a319, github.com/apache/spark/pull/7517 + + [SPARK-9479] [STREAMING] [TESTS] Fix ReceiverTrackerSuite failure for maven build and other potential test failures in Streaming + zsxwing + 2015-07-30 15:39:46 -0700 + Commit: 0dbd696, github.com/apache/spark/pull/7797 + + [SPARK-9454] Change LDASuite tests to use vector comparisons + Feynman Liang + 2015-07-30 14:08:59 -0700 + Commit: 89cda69, github.com/apache/spark/pull/7775 + + [SPARK-8186] [SPARK-8187] [SPARK-8194] [SPARK-8198] [SPARK-9133] [SPARK-9290] [SQL] functions: date_add, date_sub, add_months, months_between, time-interval calculation + Daoyuan Wang , Davies Liu + 2015-07-30 13:21:46 -0700 + Commit: 1abf7dc, github.com/apache/spark/pull/7589 + + [SPARK-5567] [MLLIB] Add predict method to LocalLDAModel + Feynman Liang + 2015-07-30 13:17:54 -0700 + Commit: d8cfd53, github.com/apache/spark/pull/7760 + + [SPARK-9460] Fix prefix generation for UTF8String. + Reynold Xin + 2015-07-30 13:09:43 -0700 + Commit: a20e743, github.com/apache/spark/pull/7789 + + [SPARK-8174] [SPARK-8175] [SQL] function unix_timestamp, from_unixtime + Daoyuan Wang + 2015-07-30 11:13:15 -0700 + Commit: 6d94bf6, github.com/apache/spark/pull/7644 + + [SPARK-9437] [CORE] avoid overflow in SizeEstimator + Imran Rashid + 2015-07-30 10:46:26 -0700 + Commit: 06b6a07, github.com/apache/spark/pull/7750 + + [SPARK-8850] [SQL] Enable Unsafe mode by default + Josh Rosen + 2015-07-30 10:45:32 -0700 + Commit: 520ec0f, github.com/apache/spark/pull/7564 + + [SPARK-9388] [YARN] Make executor info log messages easier to read. 
+ Marcelo Vanzin + 2015-07-30 10:40:04 -0700 + Commit: ab78b1d, github.com/apache/spark/pull/7706 + + [SPARK-8297] [YARN] Scheduler backend is not notified in case node fails in YARN + Mridul Muralidharan , Marcelo Vanzin + 2015-07-30 10:37:53 -0700 + Commit: e535346, github.com/apache/spark/pull/7431 + + [SPARK-9361] [SQL] Refactor new aggregation code to reduce the times of checking compatibility + Liang-Chi Hsieh + 2015-07-30 10:30:37 -0700 + Commit: 5363ed7, github.com/apache/spark/pull/7677 + + [SPARK-9267] [CORE] Retire stringify(Partial)?Value from Accumulators + François Garillot + 2015-07-30 18:14:08 +0100 + Commit: 7bbf02f, github.com/apache/spark/pull/7678 + + [SPARK-9390][SQL] create a wrapper for array type + Wenchen Fan + 2015-07-30 10:04:30 -0700 + Commit: c0cc0ea, github.com/apache/spark/pull/7724 + + [SPARK-9248] [SPARKR] Closing curly-braces should always be on their own line + Yuu ISHIKAWA + 2015-07-30 10:00:27 -0700 + Commit: 7492a33, github.com/apache/spark/pull/7795 + + [MINOR] [MLLIB] fix doc for RegexTokenizer + Xiangrui Meng + 2015-07-30 09:45:17 -0700 + Commit: 81464f2, github.com/apache/spark/pull/7798 + + [SPARK-9277] [MLLIB] SparseVector constructor must throw an error when declared number of elements less than array length + Sean Owen + 2015-07-30 09:19:55 -0700 + Commit: ed3cb1d, github.com/apache/spark/pull/7794 + + [SPARK-9225] [MLLIB] LDASuite needs unit tests for empty documents + Meihua Wu + 2015-07-30 08:52:01 -0700 + Commit: a6e53a9, github.com/apache/spark/pull/7620 + + [SPARK-] [MLLIB] minor fix on tokenizer doc + Yuhao Yang + 2015-07-30 08:20:52 -0700 + Commit: 9c0501c, github.com/apache/spark/pull/7791 + + [SPARK-8998] [MLLIB] Distribute PrefixSpan computation for large projected databases + zhangjiajin , Feynman Liang , zhang jiajin + 2015-07-30 08:14:09 -0700 + Commit: d212a31, github.com/apache/spark/pull/7412 + + [SPARK-5561] [MLLIB] Generalized PeriodicCheckpointer for RDDs and Graphs + Joseph K. Bradley + 2015-07-30 07:56:15 -0700 + Commit: c581593, github.com/apache/spark/pull/7728 + + [SPARK-7368] [MLLIB] Add QR decomposition for RowMatrix + Yuhao Yang + 2015-07-30 07:49:10 -0700 + Commit: d31c618, github.com/apache/spark/pull/5909 + + [SPARK-8838] [SQL] Add config to enable/disable merging part-files when merging parquet schema + Liang-Chi Hsieh + 2015-07-30 17:45:30 +0800 + Commit: 6175d6c, github.com/apache/spark/pull/7238 + + Fix flaky HashedRelationSuite + Reynold Xin + 2015-07-30 01:21:39 -0700 + Commit: 5ba2d44, github.com/apache/spark/pull/7784 + + Revert "[SPARK-9458] Avoid object allocation in prefix generation." + Reynold Xin + 2015-07-30 01:04:24 -0700 + Commit: 4a8bb9d + + [SPARK-9335] [TESTS] Enable Kinesis tests only when files in extras/kinesis-asl are changed + zsxwing + 2015-07-30 00:46:36 -0700 + Commit: 76f2e39, github.com/apache/spark/pull/7711 + + [SPARK-8005][SQL] Input file name + Joseph Batchik + 2015-07-29 23:35:55 -0700 + Commit: 1221849, github.com/apache/spark/pull/7743 + + [SPARK-9428] [SQL] Add test cases for null inputs for expression unit tests + Yijie Shen + 2015-07-29 23:24:20 -0700 + Commit: e127ec3, github.com/apache/spark/pull/7748 + + HOTFIX: disable HashedRelationSuite. 
+ Reynold Xin + 2015-07-29 22:51:06 -0700 + Commit: 712465b + + [SPARK-9116] [SQL] [PYSPARK] support Python only UDT in __main__ + Davies Liu + 2015-07-29 22:30:49 -0700 + Commit: e044705, github.com/apache/spark/pull/7453 + + Fix reference to self.names in StructType + Alex Angelini + 2015-07-29 22:25:38 -0700 + Commit: f5dd113, github.com/apache/spark/pull/7766 + + [SPARK-9462][SQL] Initialize nondeterministic expressions in code gen fallback mode. + Reynold Xin + 2015-07-29 21:24:47 -0700 + Commit: 27850af, github.com/apache/spark/pull/7767 + + [SPARK-9460] Avoid byte array allocation in StringPrefixComparator. + Reynold Xin + 2015-07-29 21:18:43 -0700 + Commit: 07fd7d3, github.com/apache/spark/pull/7765 + + [SPARK-9458] Avoid object allocation in prefix generation. + Reynold Xin + 2015-07-29 20:46:03 -0700 + Commit: 9514d87, github.com/apache/spark/pull/7763 + + [SPARK-9440] [MLLIB] Add hyperparameters to LocalLDAModel save/load + Feynman Liang + 2015-07-29 19:02:15 -0700 + Commit: a200e64, github.com/apache/spark/pull/7757 + + [SPARK-6129] [MLLIB] [DOCS] Added user guide for evaluation metrics + sethah + 2015-07-29 18:23:07 -0700 + Commit: 2a9fe4a, github.com/apache/spark/pull/7655 + + [SPARK-9016] [ML] make random forest classifiers implement classification trait + Holden Karau + 2015-07-29 18:18:29 -0700 + Commit: 37c2d19, github.com/apache/spark/pull/7432 + + [SPARK-8921] [MLLIB] Add @since tags to mllib.stat + Bimal Tandel + 2015-07-29 16:54:58 -0700 + Commit: 103d8cc, github.com/apache/spark/pull/7730 + + [SPARK-9448][SQL] GenerateUnsafeProjection should not share expressions across instances. + Reynold Xin + 2015-07-29 16:49:02 -0700 + Commit: 8650596, github.com/apache/spark/pull/7759 + + [SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity + Feynman Liang + 2015-07-29 16:20:20 -0700 + Commit: 2cc212d, github.com/apache/spark/pull/7705 + + [SPARK-9411] [SQL] Make Tungsten page sizes configurable + Josh Rosen + 2015-07-29 16:00:30 -0700 + Commit: 1b0099f, github.com/apache/spark/pull/7741 + + [SPARK-9436] [GRAPHX] Pregel simplification patch + Alexander Ulanov + 2015-07-29 13:59:00 -0700 + Commit: b715933, github.com/apache/spark/pull/7749 + + [SPARK-9430][SQL] Rename IntervalType to CalendarIntervalType. + Reynold Xin + 2015-07-29 13:49:22 -0700 + Commit: 5340dfa, github.com/apache/spark/pull/7745 + + [SPARK-8977] [STREAMING] Defines the RateEstimator interface, and impements the RateController + Iulian Dragos , François Garillot + 2015-07-29 13:47:37 -0700 + Commit: 819be46, github.com/apache/spark/pull/7600 + + [SPARK-746] [CORE] Added Avro Serialization to Kryo + Joseph Batchik , Joseph Batchik + 2015-07-29 14:02:32 -0500 + Commit: 069a4c4, github.com/apache/spark/pull/7004 + + [SPARK-9127][SQL] Rand/Randn codegen fails with long seed. 
+ Reynold Xin + 2015-07-29 09:36:22 -0700 + Commit: 9790694, github.com/apache/spark/pull/7747 + + [SPARK-9251][SQL] do not order by expressions which still need evaluation + Wenchen Fan + 2015-07-29 00:08:45 -0700 + Commit: 708794e, github.com/apache/spark/pull/7593 + + [SPARK-9281] [SQL] use decimal or double when parsing SQL + Davies Liu + 2015-07-28 22:51:08 -0700 + Commit: 15667a0, github.com/apache/spark/pull/7642 + + [SPARK-9398] [SQL] Datetime cleanup + Yijie Shen + 2015-07-28 22:38:28 -0700 + Commit: 6309b93, github.com/apache/spark/pull/7725 + + [SPARK-9419] ShuffleMemoryManager and MemoryStore should track memory on a per-task, not per-thread, basis + Josh Rosen + 2015-07-28 21:53:28 -0700 + Commit: ea49705, github.com/apache/spark/pull/7734 + + [SPARK-8608][SPARK-8609][SPARK-9083][SQL] reset mutable states of nondeterministic expression before evaluation and fix PullOutNondeterministic + Wenchen Fan + 2015-07-28 21:37:50 -0700 + Commit: 429b2f0, github.com/apache/spark/pull/7674 + + [SPARK-9422] [SQL] Remove the placeholder attributes used in the aggregation buffers + Yin Huai + 2015-07-28 19:01:25 -0700 + Commit: 3744b7f, github.com/apache/spark/pull/7737 + + [SPARK-9421] Fix null-handling bugs in UnsafeRow.getDouble, getFloat(), and get(ordinal, dataType) + Josh Rosen + 2015-07-28 17:51:58 -0700 + Commit: e78ec1a, github.com/apache/spark/pull/7736 + + [SPARK-9418][SQL] Use sort-merge join as the default shuffle join. + Reynold Xin + 2015-07-28 17:42:35 -0700 + Commit: 6662ee2, github.com/apache/spark/pull/7733 + + [SPARK-9420][SQL] Move expressions in sql/core package to catalyst. + Reynold Xin + 2015-07-28 17:03:59 -0700 + Commit: b7f5411, github.com/apache/spark/pull/7735 + + [STREAMING] [HOTFIX] Ignore ReceiverTrackerSuite flaky test + Tathagata Das + 2015-07-28 16:41:56 -0700 + Commit: c5ed369, github.com/apache/spark/pull/7738 + + [SPARK-9393] [SQL] Fix several error-handling bugs in ScriptTransform operator + Josh Rosen + 2015-07-28 16:04:48 -0700 + Commit: 59b92ad, github.com/apache/spark/pull/7710 + + [SPARK-9247] [SQL] Use BytesToBytesMap for broadcast join + Davies Liu + 2015-07-28 15:56:19 -0700 + Commit: 2182552, github.com/apache/spark/pull/7592 + + [SPARK-7105] [PYSPARK] [MLLIB] Support model save/load in GMM + MechCoder + 2015-07-28 15:00:25 -0700 + Commit: 198d181, github.com/apache/spark/pull/7617 + + [SPARK-8003][SQL] Added virtual column support to Spark + Joseph Batchik , JD + 2015-07-28 14:39:25 -0700 + Commit: b88b868, github.com/apache/spark/pull/7478 + + [SPARK-9391] [ML] Support minus, dot, and intercept operators in SparkR RFormula + Eric Liang + 2015-07-28 14:16:57 -0700 + Commit: 8d5bb52, github.com/apache/spark/pull/7707 + + [SPARK-9196] [SQL] Ignore test DatetimeExpressionsSuite: function current_timestamp. + Yin Huai + 2015-07-28 13:16:48 -0700 + Commit: 6cdcc21, github.com/apache/spark/pull/7727 + + [SPARK-9327] [DOCS] Fix documentation about classpath config options. + Marcelo Vanzin + 2015-07-28 11:48:56 -0700 + Commit: 31ec6a8, github.com/apache/spark/pull/7651 + + Use vector-friendly comparison for packages argument. 
+ trestletech + 2015-07-28 10:45:19 -0700 + Commit: 6143234, github.com/apache/spark/pull/7701 + + [SPARK-9397] DataFrame should provide an API to find source data files if applicable + Aaron Davidson + 2015-07-28 10:12:09 -0700 + Commit: 35ef853, github.com/apache/spark/pull/7717 + + [SPARK-8196][SQL] Fix null handling & documentation for next_day. + Reynold Xin + 2015-07-28 09:43:39 -0700 + Commit: 9bbe017, github.com/apache/spark/pull/7718 + + [SPARK-9373][SQL] follow up for StructType support in Tungsten projection. + Reynold Xin + 2015-07-28 09:43:12 -0700 + Commit: c740bed, github.com/apache/spark/pull/7720 + + [SPARK-9402][SQL] Remove CodegenFallback from Abs / FormatNumber. + Reynold Xin + 2015-07-28 09:42:35 -0700 + Commit: 5a2330e, github.com/apache/spark/pull/7723 + + [SPARK-8919] [DOCUMENTATION, MLLIB] Added @since tags to mllib.recommendation + vinodkc + 2015-07-28 08:48:57 -0700 + Commit: 4af622c, github.com/apache/spark/pull/7325 + + [EC2] Cosmetic fix for usage of spark-ec2 --ebs-vol-num option + Kenichi Maehashi + 2015-07-28 15:57:21 +0100 + Commit: ac8c549, github.com/apache/spark/pull/7632 + + [SPARK-9394][SQL] Handle parentheses in CodeFormatter. + Reynold Xin + 2015-07-28 00:52:26 -0700 + Commit: 15724fa, github.com/apache/spark/pull/7712 + + Closes #6836 since Round has already been implemented. + Reynold Xin + 2015-07-27 23:56:16 -0700 + Commit: fc3bd96 + + [SPARK-9335] [STREAMING] [TESTS] Make sure the test stream is deleted in KinesisBackedBlockRDDSuite + zsxwing + 2015-07-27 23:34:29 -0700 + Commit: d93ab93, github.com/apache/spark/pull/7663 + + [MINOR] [SQL] Support mutable expression unit test with codegen projection + Cheng Hao + 2015-07-27 23:02:23 -0700 + Commit: 9c5612f, github.com/apache/spark/pull/7566 + + [SPARK-9373][SQL] Support StructType in Tungsten projection + Reynold Xin + 2015-07-27 22:51:15 -0700 + Commit: 60f08c7, github.com/apache/spark/pull/7689 + + [SPARK-8828] [SQL] Revert SPARK-5680 + Yijie Shen + 2015-07-27 22:47:31 -0700 + Commit: 63a492b, github.com/apache/spark/pull/7667 + + Fixed a test failure. + Reynold Xin + 2015-07-27 22:04:54 -0700 + Commit: 3bc7055 + + [SPARK-9395][SQL] Create a SpecializedGetters interface to track all the specialized getters. + Reynold Xin + 2015-07-27 21:41:15 -0700 + Commit: 84da879, github.com/apache/spark/pull/7713 + + [SPARK-8195] [SPARK-8196] [SQL] udf next_day last_day + Daoyuan Wang + 2015-07-27 21:08:56 -0700 + Commit: 2e7f99a, github.com/apache/spark/pull/6986 + + [SPARK-8882] [STREAMING] Add a new Receiver scheduling mechanism + zsxwing + 2015-07-27 17:59:43 -0700 + Commit: daa1964, github.com/apache/spark/pull/7276 + + [SPARK-9386] [SQL] Feature flag for metastore partition pruning + Michael Armbrust + 2015-07-27 17:32:34 -0700 + Commit: ce89ff4, github.com/apache/spark/pull/7703 + + [SPARK-9230] [ML] Support StringType features in RFormula + Eric Liang + 2015-07-27 17:17:49 -0700 + Commit: 8ddfa52, github.com/apache/spark/pull/7574 + + [SPARK-9385] [PYSPARK] Enable PEP8 but disable installing pylint. 
+ Yin Huai + 2015-07-27 15:49:42 -0700 + Commit: dafe8d8, github.com/apache/spark/pull/7704 + + [SPARK-4352] [YARN] [WIP] Incorporate locality preferences in dynamic allocation requests + jerryshao + 2015-07-27 15:46:35 -0700 + Commit: ab62595, github.com/apache/spark/pull/6394 + + [SPARK-9385] [HOT-FIX] [PYSPARK] Comment out Python style check + Yin Huai + 2015-07-27 15:18:48 -0700 + Commit: 2104931, github.com/apache/spark/pull/7702 + + [SPARK-8988] [YARN] Make sure driver log links appear in secure cluste… + Hari Shreedharan + 2015-07-27 15:16:46 -0700 + Commit: c1be9f3, github.com/apache/spark/pull/7624 + + [SPARK-9355][SQL] Remove InternalRow.get generic getter call in columnar cache code + Wenchen Fan + 2015-07-27 13:40:50 -0700 + Commit: 3ab7525, github.com/apache/spark/pull/7673 + + [SPARK-9378] [SQL] Fixes test case "CTAS with serde" + Cheng Lian + 2015-07-27 13:28:03 -0700 + Commit: 8e7d2be, github.com/apache/spark/pull/7700 + + [SPARK-9349] [SQL] UDAF cleanup + Yin Huai + 2015-07-27 13:26:57 -0700 + Commit: 55946e7, github.com/apache/spark/pull/7687 + + Closes #7690 since it has been merged into branch-1.4. + Reynold Xin + 2015-07-27 13:21:04 -0700 + Commit: fa84e4a + + [HOTFIX] Disable pylint since it is failing master. + Reynold Xin + 2015-07-27 12:25:34 -0700 + Commit: 85a50a6 + + [SPARK-9369][SQL] Support IntervalType in UnsafeRow + Wenchen Fan + 2015-07-27 11:28:22 -0700 + Commit: 7543842, github.com/apache/spark/pull/7688 + + [SPARK-9351] [SQL] remove literals from grouping expressions in Aggregate + Wenchen Fan + 2015-07-27 11:23:29 -0700 + Commit: dd9ae79, github.com/apache/spark/pull/7583 + + [SPARK-7423] [MLLIB] Modify ClassificationModel and Probabalistic model to use Vector.argmax + George Dittmar + 2015-07-27 11:16:33 -0700 + Commit: 1f7b3d9, github.com/apache/spark/pull/7670 + + [SPARK-9376] [SQL] use a seed in RandomDataGeneratorSuite + Wenchen Fan + 2015-07-27 11:02:16 -0700 + Commit: e2f3816, github.com/apache/spark/pull/7691 + + [SPARK-9366] use task's stageAttemptId in TaskEnd event + Ryan Williams + 2015-07-27 12:54:08 -0500 + Commit: c0b7df6, github.com/apache/spark/pull/7681 + + [SPARK-9364] Fix array out of bounds and use-after-free bugs in UnsafeExternalSorter + Josh Rosen + 2015-07-27 09:34:49 -0700 + Commit: ecad9d4, github.com/apache/spark/pull/7680 + + Pregel example type fix + Alexander Ulanov + 2015-07-28 01:33:31 +0900 + Commit: 90006f3, github.com/apache/spark/pull/7695 + + [SPARK-4176] [SQL] Supports decimal types with precision > 18 in Parquet + Cheng Lian + 2015-07-27 23:29:40 +0800 + Commit: aa19c69, github.com/apache/spark/pull/7455 + + [SPARK-8405] [DOC] Add how to view logs on Web UI when yarn log aggregation is enabled + Carson Wang + 2015-07-27 08:02:40 -0500 + Commit: 6228381, github.com/apache/spark/pull/7463 + + [SPARK-7943] [SPARK-8105] [SPARK-8435] [SPARK-8714] [SPARK-8561] Fixes multi-database support + Cheng Lian + 2015-07-27 17:15:35 +0800 + Commit: 72981bc, github.com/apache/spark/pull/7623 + + [SPARK-9371][SQL] fix the support for special chars in column names for hive context + Wenchen Fan + 2015-07-26 23:58:03 -0700 + Commit: 4ffd3a1, github.com/apache/spark/pull/7684 + + [SPARK-9368][SQL] Support get(ordinal, dataType) generic getter in UnsafeRow. 
+ Reynold Xin + 2015-07-26 23:01:04 -0700 + Commit: aa80c64, github.com/apache/spark/pull/7682 + + [SPARK-9306] [SQL] Don't use SortMergeJoin when joining on unsortable columns + Liang-Chi Hsieh + 2015-07-26 22:13:37 -0700 + Commit: 945d8bc, github.com/apache/spark/pull/7645 + + [SPARK-8867][SQL] Support list / describe function usage + Cheng Hao + 2015-07-26 18:34:19 -0700 + Commit: 1efe97d, github.com/apache/spark/pull/7259 + + [SPARK-9095] [SQL] Removes the old Parquet support + Cheng Lian + 2015-07-26 16:49:19 -0700 + Commit: c025c3d, github.com/apache/spark/pull/7441 + + [SPARK-9326] Close lock file used for file downloads. + Kay Ousterhout + 2015-07-26 13:35:16 -0700 + Commit: 6b2baec, github.com/apache/spark/pull/7650 + + [SPARK-9352] [SPARK-9353] Add tests for standalone scheduling code + Andrew Or + 2015-07-26 13:03:13 -0700 + Commit: 1cf1976, github.com/apache/spark/pull/7668 + + [SPARK-9356][SQL]Remove the internal use of DecimalType.Unlimited + Yijie Shen + 2015-07-26 10:29:22 -0700 + Commit: fb5d43f, github.com/apache/spark/pull/7671 + + [SPARK-9354][SQL] Remove InternalRow.get generic getter call in Hive integration code. + Reynold Xin + 2015-07-26 10:27:39 -0700 + Commit: 6c400b4, github.com/apache/spark/pull/7669 + + [SPARK-9337] [MLLIB] Add an ut for Word2Vec to verify the empty vocabulary check + Yuhao Yang + 2015-07-26 14:02:20 +0100 + Commit: b79bf1d, github.com/apache/spark/pull/7660 + + [SPARK-9350][SQL] Introduce an InternalRow generic getter that requires a DataType + Reynold Xin + 2015-07-25 23:52:37 -0700 + Commit: 4a01bfc, github.com/apache/spark/pull/7666 + + [SPARK-8881] [SPARK-9260] Fix algorithm for scheduling executors on workers + Nishkam Ravi , nishkamravi2 + 2015-07-25 22:56:25 -0700 + Commit: 41a7cdf, github.com/apache/spark/pull/7274 + + [SPARK-9348][SQL] Remove apply method on InternalRow. + Reynold Xin + 2015-07-25 18:41:51 -0700 + Commit: b1f4b4a, github.com/apache/spark/pull/7665 + + [SPARK-9192][SQL] add initialization phase for nondeterministic expression + Wenchen Fan + 2015-07-25 12:10:02 -0700 + Commit: 2c94d0f, github.com/apache/spark/pull/7535 + + [SPARK-9285] [SQL] Fixes Row/InternalRow conversion for HadoopFsRelation + Cheng Lian + 2015-07-25 11:42:49 -0700 + Commit: e2ec018, github.com/apache/spark/pull/7649 + + [SPARK-9304] [BUILD] Improve backwards compatibility of SPARK-8401 + Sean Owen + 2015-07-25 11:05:08 +0100 + Commit: c980e20, github.com/apache/spark/pull/7639 + + [SPARK-9334][SQL] Remove UnsafeRowConverter in favor of UnsafeProjection. + Reynold Xin + 2015-07-25 01:37:41 -0700 + Commit: 215713e, github.com/apache/spark/pull/7658 + + [SPARK-9336][SQL] Remove extra JoinedRows + Reynold Xin + 2015-07-25 01:28:46 -0700 + Commit: f0ebab3, github.com/apache/spark/pull/7659 + + [Spark-8668][SQL] Adding expr to functions + JD , Joseph Batchik + 2015-07-25 00:34:59 -0700 + Commit: 723db13, github.com/apache/spark/pull/7606 + + [HOTFIX] - Disable Kinesis tests due to rate limits + Patrick Wendell + 2015-07-24 22:57:01 -0700 + Commit: 19bcd6a + + [SPARK-9331][SQL] Add a code formatter to auto-format generated code. 
+ Reynold Xin + 2015-07-24 19:35:24 -0700 + Commit: c84acd4, github.com/apache/spark/pull/7656 + + [SPARK-9330][SQL] Create specialized getStruct getter in InternalRow. + Reynold Xin + 2015-07-24 19:29:01 -0700 + Commit: f99cb56, github.com/apache/spark/pull/7654 + + [SPARK-7045] [MLLIB] Avoid intermediate representation when creating model + MechCoder + 2015-07-24 14:58:07 -0700 + Commit: a400ab5, github.com/apache/spark/pull/5748 + + [SPARK-9067] [SQL] Close reader in NewHadoopRDD early if there is no more data + Liang-Chi Hsieh + 2015-07-24 12:36:44 -0700 + Commit: 64135cb, github.com/apache/spark/pull/7424 + + [SPARK-9270] [PYSPARK] allow --name option in pyspark + Cheolsoo Park + 2015-07-24 11:56:55 -0700 + Commit: 9a11396, github.com/apache/spark/pull/7610 + + [SPARK-9261] [STREAMING] Avoid calling APIs that expose shaded classes. + Marcelo Vanzin + 2015-07-24 11:53:16 -0700 + Commit: 8399ba1, github.com/apache/spark/pull/7601 + + [SPARK-9295] Analysis should detect sorting on unsupported column types + Josh Rosen + 2015-07-24 11:34:23 -0700 + Commit: 6aceaf3, github.com/apache/spark/pull/7633 + + [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering] + MechCoder + 2015-07-24 10:56:48 -0700 + Commit: e253124, github.com/apache/spark/pull/7573 + + [SPARK-9292] Analysis should check that join conditions' data types are BooleanType + Josh Rosen + 2015-07-24 09:49:50 -0700 + Commit: c2b50d6, github.com/apache/spark/pull/7630 + + [SPARK-9305] Rename org.apache.spark.Row to Item. + Reynold Xin + 2015-07-24 09:38:13 -0700 + Commit: c8d71a4, github.com/apache/spark/pull/7638 + + [SPARK-9285][SQL] Remove InternalRow's inheritance from Row. + Reynold Xin + 2015-07-24 09:37:36 -0700 + Commit: 431ca39, github.com/apache/spark/pull/7626 + + [SPARK-9249] [SPARKR] local variable assigned but may not be used + Yu ISHIKAWA + 2015-07-24 09:10:11 -0700 + Commit: 3aec9f4, github.com/apache/spark/pull/7640 + + [SPARK-9250] Make change-scala-version more helpful w.r.t. valid Scala versions + François Garillot + 2015-07-24 17:09:33 +0100 + Commit: 428cde5, github.com/apache/spark/pull/7595 + + [SPARK-9238] [SQL] Remove two extra useless entries for bytesOfCodePointInUTF8 + zhichao.li + 2015-07-24 08:34:50 -0700 + Commit: 846cf46, github.com/apache/spark/pull/7582 + + [SPARK-9069] [SQL] follow up + Davies Liu + 2015-07-24 08:24:13 -0700 + Commit: dfb18be, github.com/apache/spark/pull/7634 + + [SPARK-9236] [CORE] Make defaultPartitioner not reuse a parent RDD's partitioner if it has 0 partitions + François Garillot + 2015-07-24 15:41:13 +0100 + Commit: 6cd28cc, github.com/apache/spark/pull/7616 + + [SPARK-8756] [SQL] Keep cached information and avoid re-calculating footers in ParquetRelation2 + Liang-Chi Hsieh + 2015-07-24 17:39:57 +0800 + Commit: 6a7e537, github.com/apache/spark/pull/7154 + + [build] Enable memory leak detection for Tungsten. + Reynold Xin + 2015-07-24 01:47:13 -0700 + Commit: 8fe32b4, github.com/apache/spark/pull/7637 + + [SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type. 
+ Reynold Xin + 2015-07-24 01:18:43 -0700 + Commit: cb8c241, github.com/apache/spark/pull/7636 + + [SPARK-9294][SQL] cleanup comments, code style, naming typo for the new aggregation + Wenchen Fan + 2015-07-23 23:40:01 -0700 + Commit: 408e64b, github.com/apache/spark/pull/7619 + + [SPARK-8092] [ML] Allow OneVsRest Classifier feature and label column names to be configurable. + Ram Sriharsha + 2015-07-23 22:35:41 -0700 + Commit: d4d762f, github.com/apache/spark/pull/6631 + + [SPARK-9216] [STREAMING] Define KinesisBackedBlockRDDs + Tathagata Das + 2015-07-23 20:06:54 -0700 + Commit: d249636, github.com/apache/spark/pull/7578 + + [SPARK-9122] [MLLIB] [PySpark] spark.mllib regression support batch predict + Yanbo Liang + 2015-07-23 18:53:07 -0700 + Commit: 52de3ac, github.com/apache/spark/pull/7614 + + [SPARK-9069] [SPARK-9264] [SQL] remove unlimited precision support for DecimalType + Davies Liu + 2015-07-23 18:31:13 -0700 + Commit: 8a94eb2, github.com/apache/spark/pull/7605 + + [SPARK-9207] [SQL] Enables Parquet filter push-down by default + Cheng Lian + 2015-07-23 17:49:33 -0700 + Commit: bebe3f7, github.com/apache/spark/pull/7612 + + [SPARK-9286] [SQL] Methods in Unevaluable should be final and AlgebraicAggregate should extend Unevaluable. + Josh Rosen + 2015-07-23 16:08:07 -0700 + Commit: b2f3aca, github.com/apache/spark/pull/7627 + + [SPARK-5447][SQL] Replace reference 'schema rdd' with DataFrame @rxin. + David Arroyo Cazorla + 2015-07-23 10:34:32 -0700 + Commit: 662d60d, github.com/apache/spark/pull/7618 + + [SPARK-9243] [Documentation] null -> zero in crosstab doc + Xiangrui Meng + 2015-07-23 10:32:11 -0700 + Commit: ecfb312, github.com/apache/spark/pull/7608 + + [SPARK-9183] confusing error message when looking up missing function in Spark SQL + Yijie Shen + 2015-07-23 10:31:12 -0700 + Commit: d2666a3, github.com/apache/spark/pull/7613 + + [Build][Minor] Fix building error & performance + Cheng Hao + 2015-07-23 10:28:20 -0700 + Commit: 19aeab5, github.com/apache/spark/pull/7611 + + [SPARK-9082] [SQL] [FOLLOW-UP] use `partition` in `PushPredicateThroughProject` + Wenchen Fan + 2015-07-23 09:37:53 -0700 + Commit: 52ef76d, github.com/apache/spark/pull/7607 + + [SPARK-9212] [CORE] upgrade Netty version to 4.0.29.Final + Zhang, Liye + 2015-07-23 12:43:54 +0100 + Commit: 26ed22a, github.com/apache/spark/pull/7562 + + Revert "[SPARK-8579] [SQL] support arbitrary object in UnsafeRow" + Reynold Xin + 2015-07-23 01:51:34 -0700 + Commit: fb36397, github.com/apache/spark/pull/7591 + + [SPARK-9266] Prevent "managed memory leak detected" exception from masking original exception + Josh Rosen + 2015-07-23 00:43:26 -0700 + Commit: ac3ae0f, github.com/apache/spark/pull/7603 + + [SPARK-8695] [CORE] [MLLIB] TreeAggregation shouldn't be triggered when it doesn't save wall-clock time. + Perinkulam I. 
Ganesh + 2015-07-23 07:46:20 +0100 + Commit: b983d49, github.com/apache/spark/pull/7397 + + [SPARK-8935] [SQL] Implement code generation for all casts + Yijie Shen + 2015-07-22 23:44:08 -0700 + Commit: 6d0d8b4, github.com/apache/spark/pull/7365 + + [SPARK-7254] [MLLIB] Run PowerIterationClustering directly on graph + Liang-Chi Hsieh , Liang-Chi Hsieh + 2015-07-22 23:29:26 -0700 + Commit: 825ab1e, github.com/apache/spark/pull/6054 + + [SPARK-9268] [ML] Removed varargs annotation from Params.setDefault taking multiple params + Joseph K. Bradley + 2015-07-22 23:27:25 -0700 + Commit: 410dd41, github.com/apache/spark/pull/7604 + + [SPARK-8364] [SPARKR] Add crosstab to SparkR DataFrames + Xiangrui Meng + 2015-07-22 21:40:23 -0700 + Commit: 2f5cbd8, github.com/apache/spark/pull/7318 + + [SPARK-9144] Remove DAGScheduler.runLocallyWithinThread and spark.localExecution.enabled + Josh Rosen , Reynold Xin + 2015-07-22 21:04:04 -0700 + Commit: b217230, github.com/apache/spark/pull/7585 + + [SPARK-9262][build] Treat Scala compiler warnings as errors + Reynold Xin , Eric Liang + 2015-07-22 21:02:19 -0700 + Commit: d71a13f, github.com/apache/spark/pull/7598 + + [SPARK-8484] [ML] Added TrainValidationSplit for hyper-parameter tuning. + martinzapletal + 2015-07-22 17:35:05 -0700 + Commit: a721ee5, github.com/apache/spark/pull/7337 + + [SPARK-9223] [PYSPARK] [MLLIB] Support model save/load in LDA + MechCoder + 2015-07-22 17:22:12 -0700 + Commit: 5307c9d, github.com/apache/spark/pull/7587 + + [SPARK-9180] fix spark-shell to accept --name option + Kenichi Maehashi + 2015-07-22 16:15:44 -0700 + Commit: 430cd78, github.com/apache/spark/pull/7512 + + [SPARK-8975] [STREAMING] Adds a mechanism to send a new rate from the driver to the block generator + Iulian Dragos , François Garillot + 2015-07-22 15:54:08 -0700 + Commit: 798dff7, github.com/apache/spark/pull/7471 + + [SPARK-9244] Increase some memory defaults + Matei Zaharia + 2015-07-22 15:28:09 -0700 + Commit: fe26584, github.com/apache/spark/pull/7586 + + [SPARK-8536] [MLLIB] Generalize OnlineLDAOptimizer to asymmetric document-topic Dirichlet priors + Feynman Liang + 2015-07-22 15:07:05 -0700 + Commit: 1aca9c1, github.com/apache/spark/pull/7575 + + [SPARK-4366] [SQL] [Follow-up] Fix SqlParser compiling warning. 
+ Yin Huai + 2015-07-22 13:28:09 -0700 + Commit: cf21d05, github.com/apache/spark/pull/7588 + + [SPARK-9224] [MLLIB] OnlineLDA Performance Improvements + Feynman Liang + 2015-07-22 13:06:01 -0700 + Commit: 8486cd8, github.com/apache/spark/pull/7454 + + [SPARK-9024] Unsafe HashJoin/HashOuterJoin/HashSemiJoin + Davies Liu + 2015-07-22 13:02:43 -0700 + Commit: e0b7ba5, github.com/apache/spark/pull/7480 + + [SPARK-9165] [SQL] codegen for CreateArray, CreateStruct and CreateNamedStruct + Yijie Shen + 2015-07-22 12:19:59 -0700 + Commit: 86f80e2, github.com/apache/spark/pull/7537 + + [SPARK-9082] [SQL] Filter using non-deterministic expressions should not be pushed down + Wenchen Fan + 2015-07-22 11:45:51 -0700 + Commit: 7652095, github.com/apache/spark/pull/7446 + + [SPARK-9254] [BUILD] [HOTFIX] sbt-launch-lib.bash should support HTTP/HTTPS redirection + Cheng Lian + 2015-07-22 09:32:42 -0700 + Commit: b55a36b, github.com/apache/spark/pull/7597 + + [SPARK-4233] [SPARK-4367] [SPARK-3947] [SPARK-3056] [SQL] Aggregation Improvement + Yin Huai , Michael Armbrust + 2015-07-21 23:26:11 -0700 + Commit: c03299a, github.com/apache/spark/pull/7458 + + [SPARK-9232] [SQL] Duplicate code in JSONRelation + Andrew Or + 2015-07-21 23:00:13 -0700 + Commit: f4785f5, github.com/apache/spark/pull/7576 + + [SPARK-9121] [SPARKR] Get rid of the warnings about `no visible global function definition` in SparkR + Yu ISHIKAWA + 2015-07-21 22:50:27 -0700 + Commit: 63f4bcc, github.com/apache/spark/pull/7567 + + [SPARK-9154][SQL] Rename formatString to format_string. + Reynold Xin + 2015-07-21 19:14:07 -0700 + Commit: a4c83cb, github.com/apache/spark/pull/7579 + + [SPARK-9154] [SQL] codegen StringFormat + Tarek Auel + 2015-07-21 15:47:40 -0700 + Commit: d4c7a7a, github.com/apache/spark/pull/7571 + + [SPARK-9206] [SQL] Fix HiveContext classloading for GCS connector. + Dennis Huo + 2015-07-21 13:12:11 -0700 + Commit: c07838b, github.com/apache/spark/pull/7549 + + [SPARK-8906][SQL] Move all internal data source classes into execution.datasources. + Reynold Xin + 2015-07-21 11:56:38 -0700 + Commit: 60c0ce1, github.com/apache/spark/pull/7565 + + [SPARK-8357] Fix unsafe memory leak on empty inputs in GeneratedAggregate + navis.ryu , Josh Rosen + 2015-07-21 11:52:52 -0700 + Commit: 9ba7c64, github.com/apache/spark/pull/6810. 
+ + Revert "[SPARK-9154] [SQL] codegen StringFormat" + Michael Armbrust + 2015-07-21 11:18:39 -0700 + Commit: 87d890c, github.com/apache/spark/pull/7570 + + [SPARK-5989] [MLLIB] Model save/load for LDA + MechCoder + 2015-07-21 10:31:31 -0700 + Commit: 89db3c0, github.com/apache/spark/pull/6948 + + [SPARK-9154] [SQL] codegen StringFormat + Tarek Auel + 2015-07-21 09:58:16 -0700 + Commit: 7f072c3, github.com/apache/spark/pull/7546 + + [SPARK-5423] [CORE] Register a TaskCompletionListener to make sure release all resources + zsxwing + 2015-07-21 09:55:42 -0700 + Commit: d45355e, github.com/apache/spark/pull/7529 + + [SPARK-4598] [WEBUI] Task table pagination for the Stage page + zsxwing + 2015-07-21 09:54:39 -0700 + Commit: 4f7f1ee, github.com/apache/spark/pull/7399 + + [SPARK-7171] Added a method to retrieve metrics sources in TaskContext + Jacek Lewandowski + 2015-07-21 09:53:33 -0700 + Commit: 3195491, github.com/apache/spark/pull/5805 + + [SPARK-9128] [CORE] Get outerclasses and objects with only one method calling in ClosureCleaner + Liang-Chi Hsieh + 2015-07-21 09:52:27 -0700 + Commit: 9a4fd87, github.com/apache/spark/pull/7459 + + [SPARK-9036] [CORE] SparkListenerExecutorMetricsUpdate messages not included in JsonProtocol + Ben + 2015-07-21 09:51:13 -0700 + Commit: f67da43, github.com/apache/spark/pull/7555 + + [SPARK-9193] Avoid assigning tasks to "lost" executor(s) + Grace + 2015-07-21 11:35:49 -0500 + Commit: 6592a60, github.com/apache/spark/pull/7528 + + [SPARK-8915] [DOCUMENTATION, MLLIB] Added @since tags to mllib.classification + petz2000 + 2015-07-21 08:50:43 -0700 + Commit: df4ddb3, github.com/apache/spark/pull/7371 + + [SPARK-9081] [SPARK-9168] [SQL] nanvl & dropna/fillna supporting nan as well + Yijie Shen + 2015-07-21 08:25:50 -0700 + Commit: be5c5d3, github.com/apache/spark/pull/7523 + + [SPARK-8401] [BUILD] Scala version switching build enhancements + Michael Allman + 2015-07-21 11:14:31 +0100 + Commit: f5b6dc5, github.com/apache/spark/pull/6832 + + [SPARK-8875] Remove BlockStoreShuffleFetcher class + Kay Ousterhout + 2015-07-21 01:12:51 -0700 + Commit: 6364735, github.com/apache/spark/pull/7268 + + [SPARK-9173][SQL]UnionPushDown should also support Intersect and Except + Yijie Shen + 2015-07-21 00:56:57 -0700 + Commit: ae23059, github.com/apache/spark/pull/7540 + + [SPARK-8230][SQL] Add array/map size method + Pedro Rodriguez , Pedro Rodriguez + 2015-07-21 00:53:20 -0700 + Commit: 560c658, github.com/apache/spark/pull/7462 + + [SPARK-8255] [SPARK-8256] [SQL] Add regex_extract/regex_replace + Cheng Hao + 2015-07-21 00:48:07 -0700 + Commit: 8c8f0ef, github.com/apache/spark/pull/7468 + + [SPARK-9100] [SQL] Adds DataFrame reader/writer shortcut methods for ORC + Cheng Lian + 2015-07-21 15:08:44 +0800 + Commit: d38c502, github.com/apache/spark/pull/7444 + + [SPARK-9161][SQL] codegen FormatNumber + Tarek Auel + 2015-07-20 23:33:07 -0700 + Commit: 1ddd0f2, github.com/apache/spark/pull/7545 + + [SPARK-9179] [BUILD] Use default primary author if unspecified + Shivaram Venkataraman + 2015-07-20 23:31:08 -0700 + Commit: 228ab65, github.com/apache/spark/pull/7558 + + [SPARK-9023] [SQL] Followup for #7456 (Efficiency improvements for UnsafeRows in Exchange) + Josh Rosen + 2015-07-20 23:28:35 -0700 + Commit: 48f8fd4, 
github.com/apache/spark/pull/7551
+
+  [SPARK-9208][SQL] Remove variant of DataFrame string functions that accept column names.
+  Reynold Xin
+  2015-07-20 22:48:13 -0700
+  Commit: 67570be, github.com/apache/spark/pull/7556
+
+  [SPARK-9157] [SQL] codegen substring
+  Tarek Auel
+  2015-07-20 22:43:30 -0700
+  Commit: 560b355, github.com/apache/spark/pull/7534
+
+  [SPARK-8797] [SPARK-9146] [SPARK-9145] [SPARK-9147] Support NaN ordering and equality comparisons in Spark SQL
+  Josh Rosen
+  2015-07-20 22:38:05 -0700
+  Commit: c032b0b, github.com/apache/spark/pull/7194
+
+  [SPARK-9204][ML] Add default params test for linearyregression suite
+  Holden Karau
+  2015-07-20 22:15:10 -0700
+  Commit: 4d97be9, github.com/apache/spark/pull/7553
+
+  [SPARK-9132][SPARK-9163][SQL] codegen conv
+  Tarek Auel
+  2015-07-20 22:08:12 -0700
+  Commit: a3c7a3c, github.com/apache/spark/pull/7552
+
+  [SPARK-9201] [ML] Initial integration of MLlib + SparkR using RFormula
+  Eric Liang
+  2015-07-20 20:49:38 -0700
+  Commit: 1cbdd89, github.com/apache/spark/pull/7483
+
+  [SPARK-9052] [SPARKR] Fix comments after curly braces
+  Yu ISHIKAWA
+  2015-07-21 11:38:22 +0900
+  Commit: 2bdf991, github.com/apache/spark/pull/7440
+
+  [SPARK-9164] [SQL] codegen hex/unhex
+  Tarek Auel
+  2015-07-20 19:17:59 -0700
+  Commit: 936a96c, github.com/apache/spark/pull/7548
+
+  [SPARK-9142][SQL] Removing unnecessary self types in expressions.
+  Reynold Xin
+  2015-07-20 18:23:51 -0700
+  Commit: e90543e, github.com/apache/spark/pull/7550
+
+  [SPARK-9156][SQL] codegen StringSplit
+  Tarek Auel
+  2015-07-20 18:21:05 -0700
+  Commit: 6853ac7, github.com/apache/spark/pull/7547
+
+  [SPARK-9178][SQL] Add an empty string constant to UTF8String
+  Tarek Auel
+  2015-07-20 18:16:49 -0700
+  Commit: 047ccc8, github.com/apache/spark/pull/7509
+
+  [SPARK-9187] [WEBUI] Timeline view may show negative value for running tasks
+  Carson Wang
+  2015-07-20 18:08:59 -0700
+  Commit: 66bb800, github.com/apache/spark/pull/7526
+
+  [SPARK-9175] [MLLIB] BLAS.gemm fails to update matrix C when alpha==0 and beta!=1
+  Meihua Wu
+  2015-07-20 17:03:46 -0700
+  Commit: ff3c72d, github.com/apache/spark/pull/7503
+
+  [SPARK-9198] [MLLIB] [PYTHON] Fixed typo in pyspark sparsevector doc tests
+  Joseph K. Bradley
+  2015-07-20 16:49:55 -0700
+  Commit: a5d0581, github.com/apache/spark/pull/7541
+
+  [SPARK-8125] [SQL] Accelerates Parquet schema merging and partition discovery
+  Cheng Lian
+  2015-07-20 16:42:43 -0700
+  Commit: a1064df, github.com/apache/spark/pull/7396
+
+  [SPARK-9160][SQL] codegen encode, decode
+  Tarek Auel
+  2015-07-20 16:11:56 -0700
+  Commit: dac7dbf, github.com/apache/spark/pull/7543
+
+  [SPARK-9159][SQL] codegen ascii, base64, unbase64
+  Tarek Auel
+  2015-07-20 15:32:46 -0700
+  Commit: c9db8ea, github.com/apache/spark/pull/7542
+
+  [SPARK-9155][SQL] codegen StringSpace
+  Tarek Auel
+  2015-07-20 15:23:28 -0700
+  Commit: 4863c11, github.com/apache/spark/pull/7531
+
+  [SPARK-6910] [SQL] Support for pushing predicates down to metastore for partition pruning
+  Cheolsoo Park , Cheng Lian , Michael Armbrust
+  2015-07-20 15:12:06 -0700
+  Commit: dde0e12, github.com/apache/spark/pull/7492
+
+  [SPARK-9114] [SQL] [PySpark] convert returned object from UDF into internal type
+  Davies Liu
+  2015-07-20 12:14:47 -0700
+  Commit: 9f913c4, github.com/apache/spark/pull/7450
+
+  [SPARK-9101] [PySpark] Add missing NullType
+  Mateusz Buśkiewicz
+  2015-07-20 12:00:48 -0700
+  Commit: 02181fb, github.com/apache/spark/pull/7499
+
+  [SPARK-8103][core] DAGScheduler should not submit multiple concurrent attempts for a stage
+  Imran Rashid , Kay Ousterhout , Imran Rashid
+  2015-07-20 10:28:32 -0700
+  Commit: 80e2568, github.com/apache/spark/pull/6750
+
+  [SQL] Remove space from DataFrame Scala/Java API.
+  Reynold Xin
+  2015-07-20 09:43:25 -0700
+  Commit: c6fe9b4, github.com/apache/spark/pull/7530
+
+  [SPARK-9186][SQL] make deterministic describing the tree rather than the expression
+  Wenchen Fan
+  2015-07-20 09:42:18 -0700
+  Commit: 04db58a, github.com/apache/spark/pull/7525
+
+  [SPARK-9177][SQL] Reuse of calendar object in WeekOfYear
+  Tarek Auel
+  2015-07-20 09:41:25 -0700
+  Commit: a15ecd0, github.com/apache/spark/pull/7516
+
+  [SPARK-9153][SQL] codegen StringLPad/StringRPad
+  Tarek Auel
+  2015-07-20 09:35:45 -0700
+  Commit: 5112b7f, github.com/apache/spark/pull/7527
+
+  [SPARK-8996] [MLLIB] [PYSPARK] Python API for Kolmogorov-Smirnov Test
+  MechCoder
+  2015-07-20 09:00:01 -0700
+  Commit: d0b4e93, github.com/apache/spark/pull/7430
+
+  [SPARK-7422] [MLLIB] Add argmax to Vector, SparseVector
+  George Dittmar , George , dittmarg , Xiangrui Meng
+  2015-07-20 08:55:37 -0700
+  Commit: 3f7de7d, github.com/apache/spark/pull/6112
+
+  [SPARK-9023] [SQL] Efficiency improvements for UnsafeRows in Exchange
+  Josh Rosen
+  2015-07-19 23:41:28 -0700
+  Commit: 79ec072, github.com/apache/spark/pull/7456
+
+  [SQL][DOC] Minor document fix in HadoopFsRelationProvider
+  Jacky Li , Jacky Li
+  2015-07-19 23:19:17 -0700
+  Commit: 972d890, github.com/apache/spark/pull/7524
+
+  Code review feedback for the previous patch.
+ Reynold Xin + 2015-07-19 22:45:56 -0700 + Commit: 5bdf16d + + [SPARK-9185][SQL] improve code gen for mutable states to support complex initialization + Wenchen Fan + 2015-07-19 22:42:44 -0700 + Commit: 930253e, github.com/apache/spark/pull/7521 + + [SPARK-9172][SQL] Make DecimalPrecision support for Intersect and Except + Liang-Chi Hsieh + 2015-07-19 20:53:18 -0700 + Commit: d743bec, github.com/apache/spark/pull/7511 + + [SPARK-9030] [STREAMING] [HOTFIX] Make sure that no attempts to create Kinesis streams is made when no enabled + Tathagata Das + 2015-07-19 20:34:30 -0700 + Commit: 93eb2ac, github.com/apache/spark/pull/7519 + + [SPARK-8241][SQL] string function: concat_ws. + Reynold Xin + 2015-07-19 16:48:47 -0700 + Commit: 163e3f1, github.com/apache/spark/pull/7504 + + [SPARK-8638] [SQL] Window Function Performance Improvements - Cleanup + Herman van Hovell + 2015-07-19 16:29:50 -0700 + Commit: 7a81245, github.com/apache/spark/pull/7513 + + [SPARK-9021] [PYSPARK] Change RDD.aggregate() to do reduce(mapPartitions()) instead of mapPartitions.fold() + Nicholas Hwang + 2015-07-19 10:30:28 -0700 + Commit: a803ac3, github.com/apache/spark/pull/7378 + + [HOTFIX] [SQL] Fixes compilation error introduced by PR #7506 + Cheng Lian + 2015-07-19 18:58:19 +0800 + Commit: 34ed82b, github.com/apache/spark/pull/7510 + + [SPARK-9179] [BUILD] Allows committers to specify primary author of the PR to be merged + Cheng Lian + 2015-07-19 17:37:25 +0800 + Commit: bc24289, github.com/apache/spark/pull/7508 + + [SQL] Make date/time functions more consistent with other database systems. + Reynold Xin + 2015-07-19 01:17:22 -0700 + Commit: 3427937, github.com/apache/spark/pull/7506 + + [SPARK-8199][SQL] follow up; revert change in test + Tarek Auel + 2015-07-19 01:16:01 -0700 + Commit: a53d13f, github.com/apache/spark/pull/7505 + + [SPARK-9094] [PARENT] Increased io.dropwizard.metrics from 3.1.0 to 3.1.2 + Carl Anders Düvel + 2015-07-19 09:14:55 +0100 + Commit: 344d156, github.com/apache/spark/pull/7493 + + [SPARK-9166][SQL][PYSPARK] Capture and hide IllegalArgumentException in Python API + Liang-Chi Hsieh + 2015-07-19 00:32:56 -0700 + Commit: 9b644c4, github.com/apache/spark/pull/7497 + + Closes #6775 since it is subsumbed by other patches. + Reynold Xin + 2015-07-18 23:47:40 -0700 + Commit: 89d1358 + + [SPARK-8638] [SQL] Window Function Performance Improvements + Herman van Hovell + 2015-07-18 23:44:38 -0700 + Commit: a9a0d0c, github.com/apache/spark/pull/7057 + + Fixed test cases. + Reynold Xin + 2015-07-18 22:50:34 -0700 + Commit: 04c1b49 + + [SPARK-8199][SPARK-8184][SPARK-8183][SPARK-8182][SPARK-8181][SPARK-8180][SPARK-8179][SPARK-8177][SPARK-8178][SPARK-9115][SQL] date functions + Tarek Auel , Tarek Auel + 2015-07-18 22:48:05 -0700 + Commit: 83b682b, github.com/apache/spark/pull/6981 + + [SPARK-8443][SQL] Split GenerateMutableProjection Codegen due to JVM Code Size Limits + Forest Fang + 2015-07-18 21:05:44 -0700 + Commit: 6cb6096, github.com/apache/spark/pull/7076 + + [SPARK-8278] Remove non-streaming JSON reader. 
+ Reynold Xin + 2015-07-18 20:27:55 -0700 + Commit: 45d798c, github.com/apache/spark/pull/7501 + + [SPARK-9150][SQL] Create CodegenFallback and Unevaluable trait + Reynold Xin + 2015-07-18 18:18:19 -0700 + Commit: 9914b1b, github.com/apache/spark/pull/7487 + + [SPARK-9174][SQL] Add documentation for all public SQLConfs. + Reynold Xin + 2015-07-18 15:29:38 -0700 + Commit: e16a19a, github.com/apache/spark/pull/7500 + + [SPARK-8240][SQL] string function: concat + Reynold Xin + 2015-07-18 14:07:56 -0700 + Commit: 6e1e2eb, github.com/apache/spark/pull/7486 + + [SPARK-9055][SQL] WidenTypes should also support Intersect and Except + Yijie Shen + 2015-07-18 12:57:53 -0700 + Commit: 3d2134f, github.com/apache/spark/pull/7491 + + Closes #6122 + Reynold Xin + 2015-07-18 12:25:04 -0700 + Commit: cdc36ee + + [SPARK-9151][SQL] Implement code generation for Abs + Liang-Chi Hsieh + 2015-07-18 12:11:37 -0700 + Commit: 225de8d, github.com/apache/spark/pull/7498 + + [SPARK-9171][SQL] add and improve tests for nondeterministic expressions + Wenchen Fan + 2015-07-18 11:58:53 -0700 + Commit: 86c50bf, github.com/apache/spark/pull/7496 + + [SPARK-9167][SQL] use UTC Calendar in `stringToDate` + Wenchen Fan + 2015-07-18 11:25:16 -0700 + Commit: 692378c, github.com/apache/spark/pull/7488 + + [SPARK-9142][SQL] remove more self type in catalyst + Wenchen Fan + 2015-07-18 11:13:49 -0700 + Commit: 1b4ff05, github.com/apache/spark/pull/7495 + + [SPARK-9143] [SQL] Add planner rule for automatically inserting Unsafe <-> Safe row format converters + Josh Rosen + 2015-07-18 11:08:18 -0700 + Commit: b8aec6c, github.com/apache/spark/pull/7482 + + [SPARK-9169][SQL] Improve unit test coverage for null expressions. 
+ Reynold Xin + 2015-07-18 11:06:46 -0700 + Commit: fba3f5b, github.com/apache/spark/pull/7490 + + [MLLIB] [DOC] Seed fix in mllib naive bayes example + Paweł Kozikowski + 2015-07-18 10:12:48 -0700 + Commit: b9ef7ac, github.com/apache/spark/pull/7477 + + [SPARK-9118] [ML] Implement IntArrayParam in mllib + Rekha Joshi , Joshi + 2015-07-17 20:02:05 -0700 + Commit: 1017908, github.com/apache/spark/pull/7481 + + [SPARK-7879] [MLLIB] KMeans API for spark.ml Pipelines + Yu ISHIKAWA + 2015-07-17 18:30:04 -0700 + Commit: 34a889d, github.com/apache/spark/pull/6756 + + [SPARK-8280][SPARK-8281][SQL]Handle NaN, null and Infinity in math + Yijie Shen + 2015-07-17 17:33:19 -0700 + Commit: 529a2c2, github.com/apache/spark/pull/7451 + + [SPARK-7026] [SQL] fix left semi join with equi key and non-equi condition + Daoyuan Wang + 2015-07-17 16:45:46 -0700 + Commit: 1707238, github.com/apache/spark/pull/5643 + + [SPARK-9030] [STREAMING] Add Kinesis.createStream unit tests that actual sends data + Tathagata Das + 2015-07-17 16:43:18 -0700 + Commit: b13ef77, github.com/apache/spark/pull/7413 + + [SPARK-9117] [SQL] fix BooleanSimplification in case-insensitive + Wenchen Fan + 2015-07-17 16:28:24 -0700 + Commit: bd903ee, github.com/apache/spark/pull/7452 + + [SPARK-9113] [SQL] enable analysis check code for self join + Wenchen Fan + 2015-07-17 16:03:33 -0700 + Commit: fd6b310, github.com/apache/spark/pull/7449 + + [SPARK-9080][SQL] add isNaN predicate expression + Yijie Shen + 2015-07-17 15:49:31 -0700 + Commit: 15fc2ff, github.com/apache/spark/pull/7464 + + [SPARK-9142] [SQL] Removing unnecessary self types in Catalyst. + Reynold Xin + 2015-07-17 15:02:13 -0700 + Commit: b2aa490, github.com/apache/spark/pull/7479 + + [SPARK-8593] [CORE] Sort app attempts by start time. 
+ Joshi , Rekha Joshi + 2015-07-17 22:47:28 +0100 + Commit: 42d8a01, github.com/apache/spark/pull/7253 + + [SPARK-7127] [MLLIB] Adding broadcast of model before prediction for ensembles + Bryan Cutler + 2015-07-17 14:10:16 -0700 + Commit: 8b8be1f, github.com/apache/spark/pull/6300 + + [SPARK-8792] [ML] Add Python API for PCA transformer + Yanbo Liang + 2015-07-17 14:08:06 -0700 + Commit: 830666f, github.com/apache/spark/pull/7190 + + [SPARK-9090] [ML] Fix definition of residual in LinearRegressionSummary, EnsembleTestHelper, and SquaredError + Feynman Liang + 2015-07-17 14:00:53 -0700 + Commit: 6da1069, github.com/apache/spark/pull/7435 + + [SPARK-5681] [STREAMING] Move 'stopReceivers' to the event loop to resolve the race condition + zsxwing , Liang-Chi Hsieh + 2015-07-17 14:00:31 -0700 + Commit: ad0954f, github.com/apache/spark/pull/4467 + + [SPARK-9136] [SQL] fix several bugs in DateTimeUtils.stringToTimestamp + Wenchen Fan + 2015-07-17 13:57:31 -0700 + Commit: 074085d, github.com/apache/spark/pull/7473 + + [SPARK-8600] [ML] Naive Bayes API for spark.ml Pipelines + Yanbo Liang + 2015-07-17 13:55:17 -0700 + Commit: 9974642, github.com/apache/spark/pull/7284 + + [SPARK-9062] [ML] Change output type of Tokenizer to Array(String, true) + Yuhao Yang + 2015-07-17 13:43:19 -0700 + Commit: 806c579, github.com/apache/spark/pull/7414 + + [SPARK-9138] [MLLIB] fix Vectors.dense + Davies Liu + 2015-07-17 12:43:58 -0700 + Commit: f9a82a8, github.com/apache/spark/pull/7476 + + [SPARK-9109] [GRAPHX] Keep the cached edge in the graph + tien-dungle + 2015-07-17 12:11:32 -0700 + Commit: 587c315, github.com/apache/spark/pull/7469 + + [SPARK-8945][SQL] Add add and subtract expressions for IntervalType + Liang-Chi Hsieh + 2015-07-17 09:38:08 -0700 + Commit: eba6a1a, github.com/apache/spark/pull/7398 + + [SPARK-8209[SQL]Add function conv + zhichao.li + 2015-07-17 09:32:27 -0700 + Commit: 305e77c, github.com/apache/spark/pull/6872 + + [SPARK-9130][SQL] throw exception when check equality between external and internal row + Wenchen Fan + 2015-07-17 09:31:13 -0700 + Commit: 59d24c2, github.com/apache/spark/pull/7460 + + [MINOR] [ML] fix wrong annotation of RFormula.formula + Yanbo Liang + 2015-07-17 09:00:41 -0700 + Commit: 441e072, github.com/apache/spark/pull/7470 + + [SPARK-8851] [YARN] In Client mode, make sure the client logs in and updates tokens + Hari Shreedharan + 2015-07-17 09:38:08 -0500 + Commit: c043a3e, github.com/apache/spark/pull/7394 + + [SPARK-9022] [SQL] Generated projections for UnsafeRow + Davies Liu + 2015-07-17 01:27:14 -0700 + Commit: ec8973d, github.com/apache/spark/pull/7437 + + [SPARK-9093] [SPARKR] Fix single-quotes strings in SparkR + Yu ISHIKAWA + 2015-07-17 17:00:50 +0900 + Commit: 5a3c1ad, github.com/apache/spark/pull/7439 + + [SPARK-9102] [SQL] Improve project collapse with nondeterministic expressions + Wenchen Fan + 2015-07-17 00:59:15 -0700 + Commit: 3f6d28a, github.com/apache/spark/pull/7445 + + Added inline comment for the canEqual PR by @cloud-fan. 
+ Reynold Xin + 2015-07-16 23:13:06 -0700 + Commit: 111c055 + + [SPARK-9126] [MLLIB] do not assert on time taken by Thread.sleep() + Xiangrui Meng + 2015-07-16 23:02:06 -0700 + Commit: 358e7bf, github.com/apache/spark/pull/7457 + + [SPARK-7131] [ML] Copy Decision Tree, Random Forest impl to spark.ml + Joseph K. Bradley + 2015-07-16 22:26:59 -0700 + Commit: 322d286, github.com/apache/spark/pull/7294 + + [SPARK-8899] [SQL] remove duplicated equals method for Row + Wenchen Fan + 2015-07-16 21:41:36 -0700 + Commit: f893955, github.com/apache/spark/pull/7291 + + [SPARK-8857][SPARK-8859][Core]Add an internal flag to Accumulable and send internal accumulator updates to the driver via heartbeats + zsxwing + 2015-07-16 21:09:09 -0700 + Commit: 812b63b, github.com/apache/spark/pull/7448 + + [SPARK-8119] HeartbeatReceiver should replace executors, not kill + Andrew Or + 2015-07-16 19:39:54 -0700 + Commit: 96aa334, github.com/apache/spark/pull/7107 + + [SPARK-6284] [MESOS] Add mesos role, principal and secret + Timothy Chen + 2015-07-16 19:36:45 -0700 + Commit: d86bbb4, github.com/apache/spark/pull/4960 + + [SPARK-8646] PySpark does not run on YARN if master not provided in command line + Lianhui Wang + 2015-07-16 19:31:14 -0700 + Commit: 49351c7, github.com/apache/spark/pull/7438 + + [SPARK-8644] Include call site in SparkException stack traces thrown by job failures + Aaron Davidson + 2015-07-16 18:14:45 -0700 + Commit: 57e9b13, github.com/apache/spark/pull/7028 + + [SPARK-6304] [STREAMING] Fix checkpointing doesn't retain driver port issue. + jerryshao , Saisai Shao + 2015-07-16 16:55:46 -0700 + Commit: 031d7d4, github.com/apache/spark/pull/5060 + + [SPARK-9085][SQL] Remove LeafNode, UnaryNode, BinaryNode from TreeNode. 
+ Reynold Xin + 2015-07-16 13:58:39 -0700 + Commit: fec10f0, github.com/apache/spark/pull/7434 + + [SPARK-6941] [SQL] Provide a better error message to when inserting into RDD based table + Yijie Shen + 2015-07-16 10:52:09 -0700 + Commit: 43dac2c, github.com/apache/spark/pull/7342 + + [SPARK-9015] [BUILD] Clean project import in scala ide + Jan Prach + 2015-07-16 18:42:41 +0100 + Commit: b536d5d, github.com/apache/spark/pull/7375 + + [SPARK-8995] [SQL] cast date strings like '2015-01-01 12:15:31' to date + Tarek Auel , Tarek Auel + 2015-07-16 08:26:39 -0700 + Commit: 4ea6480, github.com/apache/spark/pull/7353 + + [SPARK-8893] Add runtime checks against non-positive number of partitions + Daniel Darabos + 2015-07-16 08:16:54 +0100 + Commit: 0115516, github.com/apache/spark/pull/7285 + + [SPARK-8807] [SPARKR] Add between operator in SparkR + Liang-Chi Hsieh + 2015-07-15 23:36:57 -0700 + Commit: 0a79533, github.com/apache/spark/pull/7356 + + [SPARK-8972] [SQL] Incorrect result for rollup + Cheng Hao + 2015-07-15 23:35:27 -0700 + Commit: e272123, github.com/apache/spark/pull/7343 + + [SPARK-9068][SQL] refactor the implicit type cast code + Wenchen Fan + 2015-07-15 22:27:39 -0700 + Commit: ba33096, github.com/apache/spark/pull/7420 + + [SPARK-8245][SQL] FormatNumber/Length Support for Expression + Cheng Hao + 2015-07-15 21:47:21 -0700 + Commit: 42dea3a, github.com/apache/spark/pull/7034 + + [SPARK-9060] [SQL] Revert SPARK-8359, SPARK-8800, and SPARK-8677 + Yin Huai + 2015-07-15 21:08:30 -0700 + Commit: 9c64a75, github.com/apache/spark/pull/7426 + + [SPARK-9018] [MLLIB] add stopwatches + Xiangrui Meng + 2015-07-15 21:02:42 -0700 + Commit: 73d92b0, github.com/apache/spark/pull/7415 + + [SPARK-8774] [ML] Add R model formula with basic support as a transformer + Eric Liang + 2015-07-15 20:33:06 -0700 + Commit: 6960a79, github.com/apache/spark/pull/7381 + + [SPARK-9086][SQL] Remove BinaryNode from TreeNode. + Reynold Xin + 2015-07-15 17:50:11 -0700 + Commit: b064519, github.com/apache/spark/pull/7433 + + [SPARK-9071][SQL] MonotonicallyIncreasingID and SparkPartitionID should be marked as nondeterministic. + Reynold Xin + 2015-07-15 14:52:02 -0700 + Commit: affbe32, github.com/apache/spark/pull/7428 + + [SPARK-8974] Catch exceptions in allocation schedule task. 
+ KaiXinXiaoLei + 2015-07-15 22:31:10 +0100 + Commit: 674eb2a, github.com/apache/spark/pull/7352 + + [SPARK-6602][Core]Replace Akka Serialization with Spark Serializer + zsxwing + 2015-07-15 14:02:23 -0700 + Commit: b9a922e, github.com/apache/spark/pull/7159 + + [SPARK-9005] [MLLIB] Fix RegressionMetrics computation of explainedVariance + Feynman Liang + 2015-07-15 13:32:25 -0700 + Commit: 536533c, github.com/apache/spark/pull/7361 + + SPARK-9070 JavaDataFrameSuite teardown NPEs if setup failed + Steve Loughran + 2015-07-15 12:15:35 -0700 + Commit: ec9b621, github.com/apache/spark/pull/7425 + + [SPARK-7555] [DOCS] Add doc for elastic net in ml-guide and mllib-guide + Shuo Xiang + 2015-07-15 12:10:53 -0700 + Commit: 303c120, github.com/apache/spark/pull/6504 + + [Minor][SQL] Allow spaces in the beginning and ending of string for Interval + Liang-Chi Hsieh + 2015-07-15 10:46:22 -0700 + Commit: 9716a727, github.com/apache/spark/pull/7390 + + [SPARK-8221][SQL]Add pmod function + zhichao.li + 2015-07-15 10:43:38 -0700 + Commit: a938527, github.com/apache/spark/pull/6783 + + [SPARK-9020][SQL] Support mutable state in code gen expressions + Wenchen Fan + 2015-07-15 10:31:39 -0700 + Commit: fa4ec36, github.com/apache/spark/pull/7392 + + [SPARK-8840] [SPARKR] Add float coercion on SparkR + Liang-Chi Hsieh + 2015-07-15 09:48:33 -0700 + Commit: 6f69025, github.com/apache/spark/pull/7280 + + [SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark + MechCoder + 2015-07-15 08:25:53 -0700 + Commit: 20bb10f, github.com/apache/spark/pull/7241 + + [SPARK-9012] [WEBUI] Escape Accumulators in the task table + zsxwing + 2015-07-15 17:30:57 +0900 + Commit: adb33d3, github.com/apache/spark/pull/7369 + + [HOTFIX][SQL] Unit test breaking. + Reynold Xin + 2015-07-15 00:12:21 -0700 + Commit: 14935d8 + + [SPARK-8997] [MLLIB] Performance improvements in LocalPrefixSpan + Feynman Liang , Feynman Liang , Xiangrui Meng + 2015-07-14 23:50:57 -0700 + Commit: 1bb8acc, github.com/apache/spark/pull/7360 + + [SPARK-8279][SQL]Add math function round + Yijie Shen + 2015-07-14 23:30:41 -0700 + Commit: f0e1297, github.com/apache/spark/pull/6938 + + [SPARK-8018] [MLLIB] KMeans should accept initial cluster centers as param + FlytxtRnD + 2015-07-14 23:29:02 -0700 + Commit: 3f6296f, github.com/apache/spark/pull/6737 + + [SPARK-6259] [MLLIB] Python API for LDA + Yu ISHIKAWA + 2015-07-14 23:27:42 -0700 + Commit: 4692769, github.com/apache/spark/pull/6791 + + Revert SPARK-6910 and SPARK-9027 + Michael Armbrust + 2015-07-14 22:57:39 -0700 + Commit: c6b1a9e, github.com/apache/spark/pull/7409 + + [SPARK-8993][SQL] More comprehensive type checking in expressions. + Reynold Xin + 2015-07-14 22:52:53 -0700 + Commit: f23a721, github.com/apache/spark/pull/7348 + + [SPARK-8808] [SPARKR] Fix assignments in SparkR. 
+ Sun Rui + 2015-07-14 22:21:01 -0700 + Commit: f650a00, github.com/apache/spark/pull/7395 + + [HOTFIX] Adding new names to known contributors + Patrick Wendell + 2015-07-14 21:44:47 -0700 + Commit: 5572fd0 + + [SPARK-5523] [CORE] [STREAMING] Add a cache for hostname in TaskMetrics to decrease the memory usage and GC overhead + jerryshao , Saisai Shao + 2015-07-14 19:54:02 -0700 + Commit: bb870e7, github.com/apache/spark/pull/5064 + + [SPARK-8820] [STREAMING] Add a configuration to set checkpoint dir. + huangzhaowei + 2015-07-14 19:20:49 -0700 + Commit: f957796, github.com/apache/spark/pull/7218 + + [SPARK-9050] [SQL] Remove unused newOrdering argument from Exchange (cleanup after SPARK-8317) + Josh Rosen + 2015-07-14 18:55:34 -0700 + Commit: cc57d70, github.com/apache/spark/pull/7407 + + [SPARK-9045] Fix Scala 2.11 build break in UnsafeExternalRowSorter + Josh Rosen + 2015-07-14 17:21:48 -0700 + Commit: e965a79, github.com/apache/spark/pull/7405 + + [SPARK-8962] Add Scalastyle rule to ban direct use of Class.forName; fix existing uses + Josh Rosen + 2015-07-14 16:08:17 -0700 + Commit: 11e5c37, github.com/apache/spark/pull/7350 + + [SPARK-4362] [MLLIB] Make prediction probability available in NaiveBayesModel + Sean Owen + 2015-07-14 22:44:54 +0100 + Commit: 740b034, github.com/apache/spark/pull/7376 + + [SPARK-8800] [SQL] Fix inaccurate precision/scale of Decimal division operation + Liang-Chi Hsieh + 2015-07-14 14:19:27 -0700 + Commit: 4b5cfc9, github.com/apache/spark/pull/7212 + + [SPARK-4072] [CORE] Display Streaming blocks in Streaming UI + zsxwing + 2015-07-14 13:58:36 -0700 + Commit: fb1d06f, github.com/apache/spark/pull/6672 + + [SPARK-8718] [GRAPHX] Improve EdgePartition2D for non perfect square number of partitions + Andrew Ray + 2015-07-14 13:14:47 -0700 + Commit: 0a4071e, github.com/apache/spark/pull/7104 + + [SPARK-9031] Merge BlockObjectWriter and DiskBlockObject writer to remove abstract class + Josh Rosen + 2015-07-14 12:56:17 -0700 + Commit: d267c28, github.com/apache/spark/pull/7391 + + [SPARK-8911] Fix local mode endless heartbeats + Andrew Or + 2015-07-14 12:47:11 -0700 + Commit: 8fb3a65, github.com/apache/spark/pull/7382 + + [SPARK-8933] [BUILD] Provide a --force flag to build/mvn that always uses downloaded maven + Brennon York + 2015-07-14 11:43:26 -0700 + Commit: c4e98ff, github.com/apache/spark/pull/7374 + + [SPARK-9027] [SQL] Generalize metastore predicate pushdown + Michael Armbrust + 2015-07-14 11:22:09 -0700 + Commit: 37f2d96, github.com/apache/spark/pull/7386 + + [SPARK-9029] [SQL] shortcut CaseKeyWhen if key is null + Wenchen Fan + 2015-07-14 10:20:15 -0700 + Commit: 59d820a, github.com/apache/spark/pull/7389 + + [SPARK-6851] [SQL] function least/greatest follow up + Daoyuan Wang + 2015-07-14 01:09:33 -0700 + Commit: 257236c, github.com/apache/spark/pull/7387 + + [SPARK-9010] [DOCUMENTATION] Improve the Spark Configuration document about `spark.kryoserializer.buffer` + zhaishidan + 2015-07-14 08:54:30 +0100 + Commit: c1feebd, github.com/apache/spark/pull/7393 + + [SPARK-9001] Fixing errors in javadocs that lead to failed build/sbt doc + Joseph Gonzalez + 2015-07-14 00:32:29 -0700 + Commit: 20c1434, github.com/apache/spark/pull/7354 + + [SPARK-6910] [SQL] Support for pushing predicates down to metastore for 
partition pruning + Cheolsoo Park + 2015-07-13 19:45:10 -0700 + Commit: 408b384, github.com/apache/spark/pull/7216 + + [SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming when StreamingContext is closed + Neelesh Srinivas Salian + 2015-07-13 15:46:51 -0700 + Commit: b7bcbe2, github.com/apache/spark/pull/7362 + + [SPARK-8533] [STREAMING] Upgrade Flume to 1.6.0 + Hari Shreedharan + 2015-07-13 14:15:31 -0700 + Commit: 0aed38e, github.com/apache/spark/pull/6939 + + [SPARK-8636] [SQL] Fix equalNullSafe comparison + Vinod K C + 2015-07-13 12:51:33 -0700 + Commit: 4c797f2, github.com/apache/spark/pull/7040 + + [SPARK-8991] [ML] Update SharedParamsCodeGen's Generated Documentation + Vinod K C + 2015-07-13 12:03:39 -0700 + Commit: 714fc55, github.com/apache/spark/pull/7367 + + [SPARK-8954] [BUILD] Remove unneeded deb repository from Dockerfile to fix build error in docker. + yongtang + 2015-07-13 12:01:23 -0700 + Commit: 5c41691, github.com/apache/spark/pull/7346 + + Revert "[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark" + Davies Liu + 2015-07-13 11:30:36 -0700 + Commit: 79c3582 + + [SPARK-8950] [WEBUI] Correct the calculation of SchedulerDelay in StagePage + Carson Wang + 2015-07-13 11:20:04 -0700 + Commit: 5ca26fb, github.com/apache/spark/pull/7319 + + [SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark + MechCoder + 2015-07-13 09:47:53 -0700 + Commit: 9b62e93, github.com/apache/spark/pull/7241 + + [SPARK-6797] [SPARKR] Add support for YARN cluster mode. + Sun Rui + 2015-07-13 08:21:47 -0700 + Commit: 7f487c8, github.com/apache/spark/pull/6743 + + [SPARK-8596] Add module for rstudio link to spark + Vincent D. Warmerdam + 2015-07-13 08:15:54 -0700 + Commit: a5bc803, github.com/apache/spark/pull/7366 + + [SPARK-8944][SQL] Support casting between IntervalType and StringType + Wenchen Fan + 2015-07-13 00:49:39 -0700 + Commit: 6b89943, github.com/apache/spark/pull/7355 + + [SPARK-8203] [SPARK-8204] [SQL] conditional function: least/greatest + Daoyuan Wang + 2015-07-13 00:14:32 -0700 + Commit: 92540d2, github.com/apache/spark/pull/6851 + + [SPARK-9006] [PYSPARK] fix microsecond loss in Python 3 + Davies Liu + 2015-07-12 20:25:06 -0700 + Commit: 20b4743, github.com/apache/spark/pull/7363 + + [SPARK-8880] Fix confusing Stage.attemptId member variable + Kay Ousterhout + 2015-07-12 20:45:09 -0400 + Commit: 3009088, github.com/apache/spark/pull/7275 + + [SPARK-8970][SQL] remove unnecessary abstraction for ExtractValue + Wenchen Fan + 2015-07-10 23:25:11 -0700 + Commit: c472eb1, github.com/apache/spark/pull/7339 + + [SPARK-8994] [ML] tiny cleanups to Params, Pipeline + Joseph K. 
Bradley + 2015-07-10 21:25:09 -0700 + Commit: 0c5207c, github.com/apache/spark/pull/7349 + + [SPARK-6487] [MLLIB] Add sequential pattern mining algorithm PrefixSpan to Spark MLlib + zhangjiajin , zhang jiajin + 2015-07-10 21:11:46 -0700 + Commit: 7f6be1f, github.com/apache/spark/pull/7258 + + [SPARK-8598] [MLLIB] Implementation of 1-sample, two-sided, Kolmogorov Smirnov Test for RDDs + jose.cambronero + 2015-07-10 20:55:45 -0700 + Commit: 9c50757, github.com/apache/spark/pull/6994 + + [SPARK-7735] [PYSPARK] Raise Exception on non-zero exit from pipe commands + Scott Taylor + 2015-07-10 19:29:32 -0700 + Commit: 6e1c7e2, github.com/apache/spark/pull/6262 + + [SPARK-8961] [SQL] Makes BaseWriterContainer.outputWriterForRow accepts InternalRow instead of Row + Cheng Lian + 2015-07-10 18:15:36 -0700 + Commit: 3363088, github.com/apache/spark/pull/7331 + + add inline comment for python tests + Davies Liu + 2015-07-10 17:44:21 -0700 + Commit: b6fc0ad + + [SPARK-8990] [SQL] SPARK-8990 DataFrameReader.parquet() should respect user specified options + Cheng Lian + 2015-07-10 16:49:45 -0700 + Commit: 857e325, github.com/apache/spark/pull/7347 + + [SPARK-7078] [SPARK-7079] Binary processing sort for Spark SQL + Josh Rosen + 2015-07-10 16:44:51 -0700 + Commit: fb8807c, github.com/apache/spark/pull/6444 + + [SPARK-8923] [DOCUMENTATION, MLLIB] Add @since tags to mllib.fpm + rahulpalamuttam + 2015-07-10 16:07:31 -0700 + Commit: 0772026, github.com/apache/spark/pull/7341 + + [HOTFIX] fix flaky test in PySpark SQL + Davies Liu + 2015-07-10 13:05:23 -0700 + Commit: 05ac023, github.com/apache/spark/pull/7344 + + [SPARK-8675] Executors created by LocalBackend won't get the same classpath as other executor backends + Min Zhou + 2015-07-10 09:52:40 -0700 + Commit: c185f3a, github.com/apache/spark/pull/7091 + + [CORE] [MINOR] change the log level to info + Cheng Hao + 2015-07-10 09:50:46 -0700 + Commit: db6d57f, github.com/apache/spark/pull/7340 + + [SPARK-8958] Dynamic allocation: change cached timeout to infinity + Andrew Or + 2015-07-10 09:48:17 -0700 + Commit: 5dd45bd, github.com/apache/spark/pull/7329 + + [SPARK-7944] [SPARK-8013] Remove most of the Spark REPL fork for Scala 2.11 + Iulian Dragos + 2015-07-10 16:22:49 +0100 + Commit: 11e22b7, github.com/apache/spark/pull/6903 + + [SPARK-7977] [BUILD] Disallowing println + Jonathan Alter + 2015-07-10 11:34:01 +0100 + Commit: e14b545, github.com/apache/spark/pull/7093 + + [DOCS] Added important updateStateByKey details + Michael Vogiatzis + 2015-07-09 19:53:23 -0700 + Commit: d538919, github.com/apache/spark/pull/7229 + + [SPARK-8839] [SQL] ThriftServer2 will remove session and execution no matter it's finished or not. + huangzhaowei + 2015-07-09 19:31:31 -0700 + Commit: 1903641, github.com/apache/spark/pull/7239 + + [SPARK-8913] [ML] Simplify LogisticRegression suite to use Vector Vector comparision + Holden Karau + 2015-07-09 19:08:33 -0700 + Commit: 2727304, github.com/apache/spark/pull/7335 + + [SPARK-8852] [FLUME] Trim dependencies in flume assembly. 
+ Marcelo Vanzin + 2015-07-09 18:23:06 -0700 + Commit: 0e78e40, github.com/apache/spark/pull/7247 + + [SPARK-8959] [SQL] [HOTFIX] Removes parquet-thrift and libthrift dependencies + Cheng Lian + 2015-07-09 17:09:16 -0700 + Commit: 2d45571, github.com/apache/spark/pull/7330 + + [SPARK-8538] [SPARK-8539] [ML] Linear Regression Training and Testing Results + Feynman Liang + 2015-07-09 16:21:21 -0700 + Commit: a0cc3e5, github.com/apache/spark/pull/7099 + + [SPARK-8963][ML] cleanup tests in linear regression suite + Holden Karau + 2015-07-09 15:49:30 -0700 + Commit: e29ce31, github.com/apache/spark/pull/7327 + + Closes #6837 Closes #7321 Closes #2634 Closes #4963 Closes #2137 + Xiangrui Meng + 2015-07-09 15:14:14 -0700 + Commit: 6916533 + + [SPARK-8865] [STREAMING] FIX BUG: check key in kafka params + guowei2 + 2015-07-09 15:01:53 -0700 + Commit: 8977003, github.com/apache/spark/pull/7254 + + [SPARK-7902] [SPARK-6289] [SPARK-8685] [SQL] [PYSPARK] Refactor of serialization for Python DataFrame + Davies Liu + 2015-07-09 14:43:38 -0700 + Commit: c9e2ef5, github.com/apache/spark/pull/7301 + + [SPARK-8389] [STREAMING] [PYSPARK] Expose KafkaRDDs offsetRange in Python + jerryshao + 2015-07-09 13:54:44 -0700 + Commit: 3ccebf3, github.com/apache/spark/pull/7185 + + [SPARK-8701] [STREAMING] [WEBUI] Add input metadata in the batch page + zsxwing + 2015-07-09 13:48:29 -0700 + Commit: 1f6b0b1, github.com/apache/spark/pull/7081 + + [SPARK-6287] [MESOS] Add dynamic allocation to the coarse-grained Mesos scheduler + Iulian Dragos + 2015-07-09 13:26:46 -0700 + Commit: c483059, github.com/apache/spark/pull/4984 + + [SPARK-2017] [UI] Stage page hangs with many tasks + Andrew Or + 2015-07-09 13:25:11 -0700 + Commit: ebdf585, github.com/apache/spark/pull/7296 + + [SPARK-7419] [STREAMING] [TESTS] Fix CheckpointSuite.recovery with file input stream + zsxwing + 2015-07-09 13:22:17 -0700 + Commit: 88bf430, github.com/apache/spark/pull/7323 + + [SPARK-8953] SPARK_EXECUTOR_CORES is not read in SparkSubmit + xutingjun + 2015-07-09 13:21:10 -0700 + Commit: 930fe95, github.com/apache/spark/pull/7322 + + [MINOR] [STREAMING] Fix log statements in ReceiverSupervisorImpl + Tathagata Das + 2015-07-09 13:19:36 -0700 + Commit: 7ce3b81, github.com/apache/spark/pull/7328 + + [SPARK-8247] [SPARK-8249] [SPARK-8252] [SPARK-8254] [SPARK-8257] [SPARK-8258] [SPARK-8259] [SPARK-8261] [SPARK-8262] [SPARK-8253] [SPARK-8260] [SPARK-8267] [SQL] Add String Expressions + Cheng Hao + 2015-07-09 11:11:34 -0700 + Commit: 0b0b9ce, github.com/apache/spark/pull/6762 + + [SPARK-8703] [ML] Add CountVectorizer as a ml transformer to convert document to words count vector + Yuhao Yang + 2015-07-09 10:26:38 -0700 + Commit: 0cd84c8, github.com/apache/spark/pull/7084 + + [SPARK-8863] [EC2] Check aws access key from aws credentials if there is no boto config + JPark + 2015-07-09 10:23:36 -0700 + Commit: c59e268, github.com/apache/spark/pull/7252 + + [SPARK-8938][SQL] Implement toString for Interval data type + Wenchen Fan + 2015-07-09 10:04:42 -0700 + Commit: f6c0bd5, github.com/apache/spark/pull/7315 + + [SPARK-8926][SQL] Code review followup. 
+ Reynold Xin + 2015-07-09 10:01:33 -0700 + Commit: a870a82, github.com/apache/spark/pull/7313 + + [SPARK-8948][SQL] Remove ExtractValueWithOrdinal abstract class + Reynold Xin + 2015-07-09 10:01:01 -0700 + Commit: e204d22, github.com/apache/spark/pull/7316 + + [SPARK-8940] [SPARKR] Don't overwrite given schema in createDataFrame + Liang-Chi Hsieh + 2015-07-09 09:57:12 -0700 + Commit: 59cc389, github.com/apache/spark/pull/7311 + + [SPARK-8830] [SQL] native levenshtein distance + Tarek Auel + 2015-07-09 09:22:24 -0700 + Commit: a1964e9, github.com/apache/spark/pull/7236 + + [SPARK-8931] [SQL] Fallback to interpreted evaluation if failed to compile in codegen + Davies Liu + 2015-07-09 09:20:16 -0700 + Commit: 23448a9, github.com/apache/spark/pull/7309 + + [SPARK-6266] [MLLIB] PySpark SparseVector missing doc for size, indices, values + lewuathe + 2015-07-09 08:16:26 -0700 + Commit: f88b125, github.com/apache/spark/pull/7290 + + [SPARK-8942][SQL] use double not decimal when cast double and float to timestamp + Wenchen Fan + 2015-07-09 00:26:25 -0700 + Commit: 09cb0d9, github.com/apache/spark/pull/7312 + + [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode + Weizhong Lin + 2015-07-08 22:18:39 -0700 + Commit: 851e247, github.com/apache/spark/pull/7314 + + Revert "[SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode" + Cheng Lian + 2015-07-08 22:14:38 -0700 + Commit: c056484 + + [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode + Weizhong Lin + 2015-07-08 22:09:12 -0700 + Commit: 3dab0da, github.com/apache/spark/pull/7304 + + Closes #7310. + Reynold Xin + 2015-07-08 22:08:50 -0700 + Commit: a240bf3 + + [SPARK-8926][SQL] Good errors for ExpectsInputType expressions + Michael Armbrust + 2015-07-08 22:05:58 -0700 + Commit: 768907e, github.com/apache/spark/pull/7303 + + [SPARK-8937] [TEST] A setting `spark.unsafe.exceptionOnMemoryLeak ` is missing in ScalaTest config. 
+  Kousuke Saruta
+  2015-07-09 13:28:17 +0900
+  Commit: aba5784, github.com/apache/spark/pull/7308
+
+  [SPARK-8910] Fix MiMa flaky due to port contention issue
+  Andrew Or
+  2015-07-08 20:29:08 -0700
+  Commit: 47ef423, github.com/apache/spark/pull/7300
+
+  [SPARK-8932] Support copy() for UnsafeRows that do not use ObjectPools
+  Josh Rosen
+  2015-07-08 20:28:05 -0700
+  Commit: b55499a, github.com/apache/spark/pull/7306
+
+  [SPARK-8866][SQL] use 1us precision for timestamp type
+  Yijie Shen
+  2015-07-08 20:20:17 -0700
+  Commit: a290814, github.com/apache/spark/pull/7283
+
+  [SPARK-8927] [DOCS] Format wrong for some config descriptions
+  Jonathan Alter
+  2015-07-09 03:28:51 +0100
+  Commit: 28fa01e, github.com/apache/spark/pull/7292
+
+  [SPARK-8450] [SQL] [PYSARK] cleanup type converter for Python DataFrame
+  Davies Liu
+  2015-07-08 18:22:53 -0700
+  Commit: 74d8d3d, github.com/apache/spark/pull/7106
+
+  [SPARK-8914][SQL] Remove RDDApi
+  Kousuke Saruta
+  2015-07-08 18:09:39 -0700
+  Commit: 2a4f88b, github.com/apache/spark/pull/7302
+
+  [SPARK-5016] [MLLIB] Distribute GMM mixture components to executors
+  Feynman Liang
+  2015-07-08 16:32:00 -0700
+  Commit: f472b8c, github.com/apache/spark/pull/7166
+
+  [SPARK-8877] [MLLIB] Public API for association rule generation
+  Feynman Liang
+  2015-07-08 16:27:11 -0700
+  Commit: 8c32b2e, github.com/apache/spark/pull/7271
+
+  [SPARK-8068] [MLLIB] Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib
+  Yanbo Liang
+  2015-07-08 16:21:28 -0700
+  Commit: 381cb16, github.com/apache/spark/pull/7286
+
+  [SPARK-6123] [SPARK-6775] [SPARK-6776] [SQL] Refactors Parquet read path for interoperability and backwards-compatibility
+  Cheng Lian
+  2015-07-08 15:51:01 -0700
+  Commit: 4ffc27c, github.com/apache/spark/pull/7231
+
+  [SPARK-8902] Correctly print hostname in error
+  Daniel Darabos
+  2015-07-09 07:34:02 +0900
+  Commit: 5687f76, github.com/apache/spark/pull/7288
+
+  [SPARK-8700][ML] Disable feature scaling in Logistic Regression
+  DB Tsai
+  2015-07-08 15:21:58 -0700
+  Commit: 5722193, github.com/apache/spark/pull/7080
+
+  [SPARK-8908] [SQL] Add () to distinct definition in dataframe
+  Cheolsoo Park
+  2015-07-08 15:18:24 -0700
+  Commit: 00b265f, github.com/apache/spark/pull/7298
+
+  [SPARK-8909][Documentation] Change the scala example in sql-programmi…
+  Alok Singh <“singhal@us.ibm.com”>
+  2015-07-08 14:51:18 -0700
+  Commit: 8f3cd93, github.com/apache/spark/pull/7299
+
+  [SPARK-8457] [ML] NGram Documentation
+  Feynman Liang
+  2015-07-08 14:49:52 -0700
+  Commit: c5532e2, github.com/apache/spark/pull/7244
+
+  [SPARK-8783] [SQL] CTAS with WITH clause does not work
+  Keuntae Park
+  2015-07-08 14:29:52 -0700
+  Commit: f031543, github.com/apache/spark/pull/7180
+
+  [SPARK-7785] [MLLIB] [PYSPARK] Add __str__ and __repr__ to Matrices
+  MechCoder
+  2015-07-08 13:19:27 -0700
+  Commit: 2b40365, github.com/apache/spark/pull/6342
+
+  [SPARK-8900] [SPARKR] Fix sparkPackages in init documentation
+  Shivaram Venkataraman
+  2015-07-08 12:39:32 -0700
+  Commit: 374c8a8, github.com/apache/spark/pull/7293
+
+  [SPARK-8657] [YARN] Fail to upload resource to viewfs
+  Tao Li
+  2015-07-08 19:02:24 +0100
+  Commit: 26d9b6b, github.com/apache/spark/pull/7125
+
+  [SPARK-8888][SQL] Use java.util.HashMap in DynamicPartitionWriterContainer.
+  Reynold Xin
+  2015-07-08 10:56:31 -0700
+  Commit: f61c989, github.com/apache/spark/pull/7282
+
+  [SPARK-8753][SQL] Create an IntervalType data type
+  Wenchen Fan
+  2015-07-08 10:51:32 -0700
+  Commit: 0ba98c0, github.com/apache/spark/pull/7226
+
+  [SPARK-5707] [SQL] fix serialization of generated projection
+  Davies Liu
+  2015-07-08 10:43:00 -0700
+  Commit: 74335b3, github.com/apache/spark/pull/7272
+
+  [SPARK-6912] [SQL] Throw an AnalysisException when unsupported Java Map types used in Hive UDF
+  Takeshi YAMAMURO
+  2015-07-08 10:33:27 -0700
+  Commit: 3e831a2, github.com/apache/spark/pull/7257
+
+  [SPARK-8785] [SQL] Improve Parquet schema merging
+  Liang-Chi Hsieh , Liang-Chi Hsieh
+  2015-07-08 10:09:50 -0700
+  Commit: 6722aca, github.com/apache/spark/pull/7182
+
+  [SPARK-8894] [SPARKR] [DOC] Example code errors in SparkR documentation.
+  Sun Rui
+  2015-07-08 09:48:16 -0700
+  Commit: bf02e37, github.com/apache/spark/pull/7287
+
+  [SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite
+  Kashif Rasul
+  2015-07-08 08:44:58 -0700
+  Commit: 3bb2177, github.com/apache/spark/pull/7269
+
+  [SPARK-7050] [BUILD] Fix Python Kafka test assembly jar not found issue under Maven build
+  jerryshao
+  2015-07-08 12:23:32 +0100
+  Commit: 8a9d9cc, github.com/apache/spark/pull/5632
+
+  [SPARK-8883][SQL]Remove the OverrideFunctionRegistry
+  Cheng Hao
+  2015-07-08 00:10:24 -0700
+  Commit: 351a36d, github.com/apache/spark/pull/7260
+
+  [SPARK-8886][Documentation]python Style update
+  Tijo Thomas
+  2015-07-07 22:35:39 -0700
+  Commit: 08192a1, github.com/apache/spark/pull/7281
+
+  [SPARK-8879][SQL] Remove EmptyRow class.
+  Reynold Xin
+  2015-07-07 22:12:46 -0700
+  Commit: 61c3cf7, github.com/apache/spark/pull/7277
+
+  [SPARK-8878][SQL] Improve unit test coverage for bitwise expressions.
+  Reynold Xin
+  2015-07-07 19:12:40 -0700
+  Commit: 5d603df, github.com/apache/spark/pull/7273
+
+  [SPARK-8868] SqlSerializer2 can go into infinite loop when row consists only of NullType columns
+  Yin Huai
+  2015-07-07 18:36:35 -0700
+  Commit: 68a4a16, github.com/apache/spark/pull/7262
+
+  [SPARK-7190] [SPARK-8804] [SPARK-7815] [SQL] unsafe UTF8String
+  Davies Liu
+  2015-07-07 17:57:17 -0700
+  Commit: 4ca9093, github.com/apache/spark/pull/7197
+
+  [SPARK-8876][SQL] Remove InternalRow type alias in expressions package.
+ Reynold Xin + 2015-07-07 17:40:14 -0700 + Commit: 770ff10, github.com/apache/spark/pull/7270 + + [SPARK-8794] [SQL] Make PrunedScan work for Sample + Liang-Chi Hsieh , Liang-Chi Hsieh + 2015-07-07 15:49:22 -0700 + Commit: da56c4e, github.com/apache/spark/pull/7228 + + [SPARK-8845] [ML] ML use of Breeze optimization: use adjustedValue instead of value + DB Tsai + 2015-07-07 15:46:44 -0700 + Commit: 3bf20c2, github.com/apache/spark/pull/7245 + + [SPARK-8704] [ML] [PySpark] Add missing methods in StandardScaler + MechCoder + 2015-07-07 12:35:40 -0700 + Commit: 35d781e, github.com/apache/spark/pull/7086 + + [SPARK-8559] [MLLIB] Support Association Rule Generation + Feynman Liang + 2015-07-07 11:34:30 -0700 + Commit: 3336c7b, github.com/apache/spark/pull/7005 + + [SPARK-8821] [EC2] Switched to binary mode for file reading + Simon Hafner + 2015-07-07 09:42:59 -0700 + Commit: 70beb80, github.com/apache/spark/pull/7215 + + [SPARK-8823] [MLLIB] [PYSPARK] Optimizations for SparseVector dot products + MechCoder + 2015-07-07 08:59:52 -0700 + Commit: 738c107, github.com/apache/spark/pull/7222 + + [SPARK-8711] [ML] Add additional methods to PySpark ML tree models + MechCoder + 2015-07-07 08:58:08 -0700 + Commit: 1dbc4a1, github.com/apache/spark/pull/7095 + + [SPARK-8570] [MLLIB] [DOCS] Improve MLlib Local Matrix Documentation. + Mike Dusenberry + 2015-07-07 08:24:52 -0700 + Commit: 0a63d7a, github.com/apache/spark/pull/6958 + + [SPARK-8788] [ML] Add Java unit test for PCA transformer + Yanbo Liang + 2015-07-07 08:19:17 -0700 + Commit: d73bc08, github.com/apache/spark/pull/7184 + + [SPARK-6731] [CORE] Addendum: Upgrade Apache commons-math3 to 3.4.1 + Sean Owen + 2015-07-07 08:09:56 -0700 + Commit: dcbd85b, github.com/apache/spark/pull/7261 + + [HOTFIX] Rename release-profile to release + Patrick Wendell + 2015-07-06 22:14:24 -0700 + Commit: 1cb2629 + + [SPARK-8759][SQL] add default eval to binary and unary expression according to default behavior of nullable + Wenchen Fan + 2015-07-06 22:13:50 -0700 + Commit: c46aaf4, github.com/apache/spark/pull/7157 + + [SPARK-5562] [MLLIB] LDA should handle empty document. 
+ Alok Singh , Alok Singh , Alok Singh <“singhal@us.ibm.com”> + 2015-07-06 21:53:55 -0700 + Commit: 6718c1e, github.com/apache/spark/pull/7064 + + [SPARK-6747] [SQL] Throw an AnalysisException when unsupported Java list types used in Hive UDF + Takeshi YAMAMURO + 2015-07-06 19:44:31 -0700 + Commit: 1821fc1, github.com/apache/spark/pull/7248 + + Revert "[SPARK-8781] Fix variables in published pom.xml are not resolved" + Andrew Or + 2015-07-06 19:27:04 -0700 + Commit: 929dfa2 + + [SPARK-8819] Fix build for maven 3.3.x + Andrew Or + 2015-07-06 19:22:30 -0700 + Commit: 9eae5fa, github.com/apache/spark/pull/7219 + + [SPARK-8463][SQL] Use DriverRegistry to load jdbc driver at writing path + Liang-Chi Hsieh + 2015-07-06 17:16:44 -0700 + Commit: d4d6d31, github.com/apache/spark/pull/6900 + + [SPARK-8072] [SQL] Better AnalysisException for writing DataFrame with identically named columns + animesh + 2015-07-06 16:39:49 -0700 + Commit: 09a0641, github.com/apache/spark/pull/7013 + + [SPARK-8588] [SQL] Regression test + Yin Huai + 2015-07-06 16:26:31 -0700 + Commit: 7b467cc, github.com/apache/spark/pull/7103 + + [SPARK-8765] [MLLIB] Fix PySpark PowerIterationClustering test issue + Yanbo Liang + 2015-07-06 16:15:12 -0700 + Commit: 0effe18, github.com/apache/spark/pull/7177 + + Revert "[SPARK-7212] [MLLIB] Add sequence learning flag" + Xiangrui Meng + 2015-07-06 16:11:22 -0700 + Commit: 96c5eee, github.com/apache/spark/pull/7240 + + [SPARK-6707] [CORE] [MESOS] Mesos Scheduler should allow the user to specify constraints based on slave attributes + Ankur Chauhan + 2015-07-06 16:04:57 -0700 + Commit: 1165b17, github.com/apache/spark/pull/5563 + + [SPARK-8656] [WEBUI] Fix the webUI and JSON API number is not synced + Wisely Chen + 2015-07-06 16:04:01 -0700 + Commit: 9ff2033, github.com/apache/spark/pull/7038 + + [MINOR] [SQL] remove unused code in Exchange + Daoyuan Wang + 2015-07-06 15:54:43 -0700 + Commit: 132e7fc, github.com/apache/spark/pull/7234 + + [SPARK-4485] [SQL] 1) Add broadcast hash outer join, (2) Fix SparkPlanTest + kai + 2015-07-06 14:33:30 -0700 + Commit: 2471c0b, github.com/apache/spark/pull/7162 + + [SPARK-8784] [SQL] Add Python API for hex and unhex + Davies Liu + 2015-07-06 13:31:31 -0700 + Commit: 37e4d92, github.com/apache/spark/pull/7223 + + Small update in the readme file + Dirceu Semighini Filho + 2015-07-06 13:28:07 -0700 + Commit: 57c72fc, github.com/apache/spark/pull/7242 + + [SPARK-8837][SPARK-7114][SQL] support using keyword in column name + Wenchen Fan + 2015-07-06 13:26:46 -0700 + Commit: 0e19464, github.com/apache/spark/pull/7237 + + [SPARK-8124] [SPARKR] Created more examples on SparkR DataFrames + Daniel Emaasit (PhD Student) + 2015-07-06 10:36:02 -0700 + Commit: 293225e, github.com/apache/spark/pull/6668 + + [SPARK-8841] [SQL] Fix partition pruning percentage log message + Steve Lindemann + 2015-07-06 10:17:05 -0700 + Commit: 39e4e7e, github.com/apache/spark/pull/7227 + + [SPARK-8831][SQL] Support AbstractDataType in TypeCollection. 
+ Reynold Xin + 2015-07-05 23:54:25 -0700 + Commit: 86768b7, github.com/apache/spark/pull/7232 + + [SQL][Minor] Update the DataFrame API for encode/decode + Cheng Hao + 2015-07-05 21:50:52 -0700 + Commit: 6d0411b, github.com/apache/spark/pull/7230 + + [SPARK-8549] [SPARKR] Fix the line length of SparkR + Yu ISHIKAWA + 2015-07-05 20:50:02 -0700 + Commit: a0cb111, github.com/apache/spark/pull/7204 + + [SPARK-7137] [ML] Update SchemaUtils checkInputColumn to print more info if needed + Joshi , Rekha Joshi + 2015-07-05 12:58:03 -0700 + Commit: f9c448d, github.com/apache/spark/pull/5992 + + [MINOR] [SQL] Minor fix for CatalystSchemaConverter + Liang-Chi Hsieh + 2015-07-04 22:52:50 -0700 + Commit: 2b820f2, github.com/apache/spark/pull/7224 + + [SPARK-8822][SQL] clean up type checking in math.scala. + Reynold Xin + 2015-07-04 11:55:20 -0700 + Commit: c991ef5, github.com/apache/spark/pull/7220 + + [SQL] More unit tests for implicit type cast & add simpleString to AbstractDataType. + Reynold Xin + 2015-07-04 11:55:04 -0700 + Commit: 347cab8, github.com/apache/spark/pull/7221 + + Fixed minor style issue with the previous merge. + Reynold Xin + 2015-07-04 01:11:35 -0700 + Commit: 48f7aed + + [SPARK-8270][SQL] levenshtein distance + Tarek Auel + 2015-07-04 01:10:52 -0700 + Commit: 6b3574e, github.com/apache/spark/pull/7214 + + [SPARK-8238][SPARK-8239][SPARK-8242][SPARK-8243][SPARK-8268][SQL]Add ascii/base64/unbase64/encode/decode functions + Cheng Hao + 2015-07-03 23:45:21 -0700 + Commit: f35b0c3, github.com/apache/spark/pull/6843 + + [SPARK-8777] [SQL] Add random data generator test utilities to Spark SQL + Josh Rosen + 2015-07-03 23:05:17 -0700 + Commit: f32487b, github.com/apache/spark/pull/7176 + + [SPARK-8192] [SPARK-8193] [SQL] udf current_date, current_timestamp + Daoyuan Wang + 2015-07-03 22:19:43 -0700 + Commit: 9fb6b83, github.com/apache/spark/pull/6985 + + [SPARK-8572] [SQL] Type coercion for ScalaUDFs + Cheolsoo Park + 2015-07-03 22:14:21 -0700 + Commit: 4a22bce, github.com/apache/spark/pull/7203 + + [SPARK-8810] [SQL] Added several UDF unit tests for Spark SQL + Spiro Michaylov + 2015-07-03 20:15:58 -0700 + Commit: e92c24d, github.com/apache/spark/pull/7207 + + [SPARK-7401] [MLLIB] [PYSPARK] Vectorize dot product and sq_dist between SparseVector and DenseVector + MechCoder + 2015-07-03 15:49:32 -0700 + Commit: f0fac2a, github.com/apache/spark/pull/5946 + + [SPARK-8226] [SQL] Add function shiftrightunsigned + zhichao.li + 2015-07-03 15:39:16 -0700 + Commit: ab535b9, github.com/apache/spark/pull/7035 + + [SPARK-8809][SQL] Remove ConvertNaNs analyzer rule. 
+  Reynold Xin
+  2015-07-03 00:25:02 -0700
+  Commit: 2848f4d, github.com/apache/spark/pull/7206
+
+  [SPARK-8803] handle special characters in elements in crosstab
+  Burak Yavuz
+  2015-07-02 22:10:24 -0700
+  Commit: 9b23e92, github.com/apache/spark/pull/7201
+
+  [SPARK-8776] Increase the default MaxPermSize
+  Yin Huai
+  2015-07-02 22:09:07 -0700
+  Commit: f743c79, github.com/apache/spark/pull/7196
+
+  [SPARK-8801][SQL] Support TypeCollection in ExpectsInputTypes
+  Reynold Xin
+  2015-07-02 21:45:25 -0700
+  Commit: a59d14f, github.com/apache/spark/pull/7202
+
+  [SPARK-8501] [SQL] Avoids reading schema from empty ORC files
+  Cheng Lian
+  2015-07-02 21:30:57 -0700
+  Commit: 20a4d7d, github.com/apache/spark/pull/7199
+
+  Minor style fix for the previous commit.
+  Reynold Xin
+  2015-07-02 20:47:04 -0700
+  Commit: dfd8bac
+
+  [SPARK-8213][SQL]Add function factorial
+  zhichao.li
+  2015-07-02 20:37:31 -0700
+  Commit: 1a7a7d7, github.com/apache/spark/pull/6822
+
+  [SPARK-6980] [CORE] Akka timeout exceptions indicate which conf controls them (RPC Layer)
+  Bryan Cutler , Harsh Gupta , BryanCutler
+  2015-07-02 21:38:21 -0500
+  Commit: aa7bbc1, github.com/apache/spark/pull/6205
+
+  [SPARK-8782] [SQL] Fix code generation for ORDER BY NULL
+  Josh Rosen
+  2015-07-02 18:07:09 -0700
+  Commit: d983819, github.com/apache/spark/pull/7179
+
+  Revert "[SPARK-8784] [SQL] Add Python API for hex and unhex"
+  Reynold Xin
+  2015-07-02 16:25:10 -0700
+  Commit: e589e71
+
+  [SPARK-7104] [MLLIB] Support model save/load in Python's Word2Vec
+  Yu ISHIKAWA
+  2015-07-02 15:55:16 -0700
+  Commit: 488bad3, github.com/apache/spark/pull/6821
+
+  [SPARK-8784] [SQL] Add Python API for hex and unhex
+  Davies Liu
+  2015-07-02 15:43:02 -0700
+  Commit: fc7aebd, github.com/apache/spark/pull/7181
+
+  [SPARK-3382] [MLLIB] GradientDescent convergence tolerance
+  lewuathe
+  2015-07-02 15:00:13 -0700
+  Commit: 7d9cc96, github.com/apache/spark/pull/3636
+
+  [SPARK-8772][SQL] Implement implicit type cast for expressions that define input types.
+  Reynold Xin
+  2015-07-02 14:16:14 -0700
+  Commit: 52508be, github.com/apache/spark/pull/7175
+
+  [SPARK-7835] Refactor HeartbeatReceiverSuite for coverage + cleanup
+  Andrew Or
+  2015-07-02 13:59:56 -0700
+  Commit: cd20355, github.com/apache/spark/pull/7173
+
+  [SPARK-1564] [DOCS] Added Javascript to Javadocs to create badges for tags like :: Experimental ::
+  Deron Eriksson
+  2015-07-02 13:55:53 -0700
+  Commit: fcbcba6, github.com/apache/spark/pull/7169
+
+  [SPARK-8781] Fix variables in published pom.xml are not resolved
+  Andrew Or
+  2015-07-02 13:49:45 -0700
+  Commit: 82cf331, github.com/apache/spark/pull/7193
+
+  [SPARK-8479] [MLLIB] Add numNonzeros and numActives to linalg.Matrices
+  MechCoder
+  2015-07-02 11:28:14 -0700
+  Commit: 34d448d, github.com/apache/spark/pull/6904
+
+  [SPARK-8581] [SPARK-8584] Simplify checkpointing code + better error message
+  Andrew Or
+  2015-07-02 10:57:02 -0700
+  Commit: 2e2f326, github.com/apache/spark/pull/6968
+
+  [SPARK-8708] [MLLIB] Paritition ALS ratings based on both users and products
+  Liang-Chi Hsieh , Liang-Chi Hsieh
+  2015-07-02 10:18:23 -0700
+  Commit: 0e553a3, github.com/apache/spark/pull/7121
+
+  [SPARK-8407] [SQL] complex type constructors: struct and named_struct
+  Yijie Shen
+  2015-07-02 10:12:25 -0700
+  Commit: 52302a8, github.com/apache/spark/pull/6874
+
+  [SPARK-8747] [SQL] fix EqualNullSafe for binary type
+  Wenchen Fan
+  2015-07-02 10:06:38 -0700
+  Commit: afa021e, github.com/apache/spark/pull/7143
+
+  [SPARK-8223] [SPARK-8224] [SQL] shift left and shift right
+  Tarek Auel
+  2015-07-02 10:02:19 -0700
+  Commit: 5b33381, github.com/apache/spark/pull/7178
+
+  [SPARK-8758] [MLLIB] Add Python user guide for PowerIterationClustering
+  Yanbo Liang
+  2015-07-02 09:59:54 -0700
+  Commit: 0a468a4, github.com/apache/spark/pull/7155
+
+  [SPARK-8647] [MLLIB] Potential issue with constant hashCode
+  Alok Singh
+  2015-07-02 09:58:57 -0700
+  Commit: 99c40cd, github.com/apache/spark/pull/7146
+
+  [SPARK-8690] [SQL] Add a setting to disable SparkSQL parquet schema merge by using datasource API
+  Wisely Chen
+  2015-07-02 09:58:12 -0700
+  Commit: 246265f, github.com/apache/spark/pull/7070
+
+  [SPARK-8746] [SQL] update download link for Hive 0.13.1
+  Christian Kadner
+  2015-07-02 13:45:19 +0100
+  Commit: 1bbdf9e, github.com/apache/spark/pull/7144
+
+  [SPARK-8787] [SQL] Changed parameter order of @deprecated in package object sql
+  Vinod K C
+  2015-07-02 13:42:48 +0100
+  Commit: c572e25, github.com/apache/spark/pull/7183
+
+  [DOCS] Fix minor wrong lambda expression example.
+  Kousuke Saruta
+  2015-07-02 21:16:35 +0900
+  Commit: 4158836, github.com/apache/spark/pull/7187
+
+  [SPARK-8687] [YARN] Fix bug: Executor can't fetch the new set configuration in yarn-client
+  huangzhaowei
+  2015-07-01 23:14:13 -0700
+  Commit: 1b0c8e6, github.com/apache/spark/pull/7066
+
+  [SPARK-3071] Increase default driver memory
+  Ilya Ganelin
+  2015-07-01 23:11:02 -0700
+  Commit: 3697232, github.com/apache/spark/pull/7132
+
+  [SPARK-8740] [PROJECT INFRA] Support GitHub OAuth tokens in dev/merge_spark_pr.py
+  Josh Rosen
+  2015-07-01 23:06:52 -0700
+  Commit: 377ff4c, github.com/apache/spark/pull/7136
+
+  [SPARK-8769] [TRIVIAL] [DOCS] toLocalIterator should mention it results in many jobs
+  Holden Karau
+  2015-07-01 23:05:45 -0700
+  Commit: 15d41cc, github.com/apache/spark/pull/7171
+
+  [SPARK-8771] [TRIVIAL] Add a version to the deprecated annotation for the actorSystem
+  Holden Karau
+  2015-07-01 23:04:05 -0700
+  Commit: d14338e, github.com/apache/spark/pull/7172
+
+  [SPARK-8688] [YARN] Bug fix: disable the cache fs to gain the HDFS connection.
+  huangzhaowei
+  2015-07-01 23:01:44 -0700
+  Commit: 646366b, github.com/apache/spark/pull/7069
+
+  [SPARK-8754] [YARN] YarnClientSchedulerBackend doesn't stop gracefully in failure conditions
+  Devaraj K
+  2015-07-01 22:59:04 -0700
+  Commit: 792fcd8, github.com/apache/spark/pull/7153
+
+  [SPARK-8227] [SQL] Add function unhex
+  zhichao.li
+  2015-07-01 22:19:51 -0700
+  Commit: b285ac5, github.com/apache/spark/pull/7113
+
+  [SPARK-8660] [MLLIB] removed > symbols from comments in LogisticRegressionSuite.scala for ease of copypaste
+  Rosstin
+  2015-07-01 21:42:06 -0700
+  Commit: 4e4f74b, github.com/apache/spark/pull/7167
+
+  [SPARK-8770][SQL] Create BinaryOperator abstract class.
+  Reynold Xin
+  2015-07-01 21:14:13 -0700
+  Commit: 9fd13d5, github.com/apache/spark/pull/7174
+
+  Revert "[SPARK-8770][SQL] Create BinaryOperator abstract class."
+  Reynold Xin
+  2015-07-01 16:59:39 -0700
+  Commit: 3a342de
+
+  [SPARK-8770][SQL] Create BinaryOperator abstract class.
+  Reynold Xin
+  2015-07-01 16:56:48 -0700
+  Commit: 2727789, github.com/apache/spark/pull/7170
+
+  [SPARK-8766] support non-ascii character in column names
+  Davies Liu
+  2015-07-01 16:43:18 -0700
+  Commit: f958f27, github.com/apache/spark/pull/7165
+
+  [SPARK-3444] [CORE] Restore INFO level after log4j test.
+  Marcelo Vanzin
+  2015-07-01 20:40:47 +0100
+  Commit: 1ce6428, github.com/apache/spark/pull/7140
+
+  [QUICKFIX] [SQL] fix copy of generated row
+  Davies Liu
+  2015-07-01 12:39:57 -0700
+  Commit: 3083e17, github.com/apache/spark/pull/7163
+
+  [SPARK-7820] [BUILD] Fix Java8-tests suite compile and test error under sbt
+  jerryshao
+  2015-07-01 12:33:24 -0700
+  Commit: 9f7db34, github.com/apache/spark/pull/7120
+
+  [SPARK-8378] [STREAMING] Add the Python API for Flume
+  zsxwing
+  2015-07-01 11:59:24 -0700
+  Commit: 75b9fe4, github.com/apache/spark/pull/6830
+
+  [SPARK-8765] [MLLIB] [PYTHON] removed flaky python PIC test
+  Joseph K. Bradley
+  2015-07-01 11:57:52 -0700
+  Commit: b8faa32, github.com/apache/spark/pull/7164
+
+  [SPARK-8308] [MLLIB] add missing save load for python example
+  Yuhao Yang
+  2015-07-01 11:17:56 -0700
+  Commit: 2012913, github.com/apache/spark/pull/6760
+
+  [SPARK-6263] [MLLIB] Python MLlib API missing items: Utils
+  lewuathe
+  2015-07-01 11:14:07 -0700
+  Commit: 184de91, github.com/apache/spark/pull/5707
+
+  [SPARK-8621] [SQL] support empty string as column name
+  Wenchen Fan
+  2015-07-01 10:31:35 -0700
+  Commit: 31b4a3d, github.com/apache/spark/pull/7149
+
+  [SPARK-8752][SQL] Add ExpectsInputTypes trait for defining expected input types.
+  Reynold Xin
+  2015-07-01 10:30:54 -0700
+  Commit: 4137f76, github.com/apache/spark/pull/7151
+
+  [SPARK-7714] [SPARKR] SparkR tests should use more specific expectations than expect_true
+  Sun Rui
+  2015-07-01 09:50:12 -0700
+  Commit: 69c5dee, github.com/apache/spark/pull/7152
+
+  [SPARK-8763] [PYSPARK] executing run-tests.py with Python 2.6 fails with absence of subprocess.check_output function
+  cocoatomo
+  2015-07-01 09:37:09 -0700
+  Commit: fdcad6e, github.com/apache/spark/pull/7161
+
+  [SPARK-8750][SQL] Remove the closure in functions.callUdf.
+  Reynold Xin
+  2015-07-01 01:08:20 -0700
+  Commit: 9765241, github.com/apache/spark/pull/7148
+
+  [SQL] [MINOR] remove internalRowRDD in DataFrame
+  Wenchen Fan
+  2015-07-01 01:02:33 -0700
+  Commit: 0eee061, github.com/apache/spark/pull/7116
+
+  [SPARK-8749][SQL] Remove HiveTypeCoercion trait.
+  Reynold Xin
+  2015-07-01 00:08:16 -0700
+  Commit: fc3a6fe, github.com/apache/spark/pull/7147
+
+  [SPARK-8748][SQL] Move castability test out from Cast case class into Cast object.
+  Reynold Xin
+  2015-06-30 23:04:54 -0700
+  Commit: 365c140, github.com/apache/spark/pull/7145
+
+  [SPARK-6602][Core]Remove unnecessary synchronized
+  zsxwing
+  2015-06-30 21:57:07 -0700
+  Commit: 64c1461, github.com/apache/spark/pull/7141
+
+  [SPARK-8535] [PYSPARK] PySpark : Can't create DataFrame from Pandas dataframe with no explicit column name
+  x1-
+  2015-06-30 20:35:46 -0700
+  Commit: b6e76ed, github.com/apache/spark/pull/7124
+
+  [SPARK-8471] [ML] Rename DiscreteCosineTransformer to DCT
+  Feynman Liang
+  2015-06-30 20:19:43 -0700
+  Commit: f457569, github.com/apache/spark/pull/7138
+
+  [SPARK-6602][Core] Update Master, Worker, Client, AppClient and related classes to use RpcEndpoint
+  zsxwing
+  2015-06-30 17:39:55 -0700
+  Commit: 3bee0f1, github.com/apache/spark/pull/5392
+
+  [SPARK-8727] [SQL] Missing python api; md5, log2
+  Tarek Auel , Tarek Auel
+  2015-06-30 16:59:44 -0700
+  Commit: ccdb052, github.com/apache/spark/pull/7114
+
+  [SPARK-8741] [SQL] Remove e and pi from DataFrame functions.
+  Reynold Xin
+  2015-06-30 16:54:51 -0700
+  Commit: 8133125, github.com/apache/spark/pull/7137
+
+  [SPARK-7739] [MLLIB] Improve ChiSqSelector example code in user guide
+  sethah
+  2015-06-30 16:28:25 -0700
+  Commit: 8d23587, github.com/apache/spark/pull/7029
+
+  [SPARK-8738] [SQL] [PYSPARK] capture SQL AnalysisException in Python API
+  Davies Liu
+  2015-06-30 16:17:46 -0700
+  Commit: 58ee2a2, github.com/apache/spark/pull/7135
+
+  [SPARK-8739] [WEB UI] [WINDOWS] A illegal character `\r` can be contained in StagePage.
+ Kousuke Saruta + 2015-06-30 14:09:29 -0700 + Commit: d2495f7, github.com/apache/spark/pull/7133 + + [SPARK-8563] [MLLIB] Fixed a bug so that IndexedRowMatrix.computeSVD().U.numCols = k + lee19 + 2015-06-30 14:08:00 -0700 + Commit: e725262, github.com/apache/spark/pull/6953 + + [SPARK-8705] [WEBUI] Don't display rects when totalExecutionTime is 0 + zsxwing + 2015-06-30 14:06:50 -0700 + Commit: 8c89896, github.com/apache/spark/pull/7088 + + [SPARK-8736] [ML] GBTRegressor should not threshold prediction + Joseph K. Bradley + 2015-06-30 14:02:50 -0700 + Commit: 3ba23ff, github.com/apache/spark/pull/7134 + + [SPARK-8372] Do not show applications that haven't recorded their app ID yet. + Marcelo Vanzin , Carson Wang + 2015-06-30 14:01:52 -0700 + Commit: 4bb8375, github.com/apache/spark/pull/7097 + + [SPARK-2645] [CORE] Allow SparkEnv.stop() to be called multiple times without side effects. + Joshi , Rekha Joshi + 2015-06-30 14:00:35 -0700 + Commit: 7dda084, github.com/apache/spark/pull/6973 + + [SPARK-8560] [UI] The Executors page will have negative if having resubmitted tasks + xutingjun + 2015-06-30 13:56:59 -0700 + Commit: 79f0b37, github.com/apache/spark/pull/6950 + + [SPARK-7514] [MLLIB] Add MinMaxScaler to feature transformation + Yuhao Yang + 2015-06-30 12:44:43 -0700 + Commit: 61d7b53, github.com/apache/spark/pull/6039 + + [SPARK-8471] [ML] Discrete Cosine Transform Feature Transformer + Feynman Liang + 2015-06-30 12:31:33 -0700 + Commit: 74cc16d, github.com/apache/spark/pull/6894 + + [SPARK-8628] [SQL] Race condition in AbstractSparkSQLParser.parse + Vinod K C + 2015-06-30 12:24:47 -0700 + Commit: b8e5bb6, github.com/apache/spark/pull/7015 + + [SPARK-8664] [ML] Add PCA transformer + Yanbo Liang + 2015-06-30 12:23:48 -0700 + Commit: c1befd7, github.com/apache/spark/pull/7065 + + [SPARK-6785] [SQL] fix DateTimeUtils for dates before 1970 + Christian Kadner + 2015-06-30 12:22:34 -0700 + Commit: 1e1f339, github.com/apache/spark/pull/6983 + + [SPARK-8619] [STREAMING] Don't recover keytab and principal configuration within Streaming checkpoint + huangzhaowei + 2015-06-30 11:46:22 -0700 + Commit: d16a944, github.com/apache/spark/pull/7008 + + [SPARK-8630] [STREAMING] Prevent from checkpointing QueueInputDStream + zsxwing + 2015-06-30 11:14:38 -0700 + Commit: 5726440, github.com/apache/spark/pull/7016 + + [SPARK-7988] [STREAMING] Round-robin scheduling of receivers by default + nishkamravi2 , Nishkam Ravi + 2015-06-30 11:12:15 -0700 + Commit: ca7e460, github.com/apache/spark/pull/6607 + + [SPARK-8615] [DOCUMENTATION] Fixed Sample deprecated code + Tijo Thomas + 2015-06-30 10:50:45 -0700 + Commit: 9213f73, github.com/apache/spark/pull/7039 + + [SPARK-8713] Make codegen thread safe + Davies Liu + 2015-06-30 10:48:49 -0700 + Commit: fbb267e, github.com/apache/spark/pull/7101 + + [SPARK-8679] [PYSPARK] [MLLIB] Default values in Pipeline API should be immutable + MechCoder + 2015-06-30 10:27:29 -0700 + Commit: 5fa0863, github.com/apache/spark/pull/7058 + + [SPARK-4127] [MLLIB] [PYSPARK] Python bindings for StreamingLinearRegressionWithSGD + MechCoder + 2015-06-30 10:25:59 -0700 + Commit: 4528166, github.com/apache/spark/pull/6744 + + [SPARK-8437] [DOCS] Corrected: Using directory path without wildcard for filename slow for large number 
of files with wholeTextFiles and binaryFiles + Sean Owen + 2015-06-30 10:07:26 -0700 + Commit: ada384b, github.com/apache/spark/pull/7126 + + [SPARK-8592] [CORE] CoarseGrainedExecutorBackend: Cannot register with driver => NPE + xuchenCN + 2015-06-30 10:05:51 -0700 + Commit: 689da28, github.com/apache/spark/pull/7110 + + [SPARK-8236] [SQL] misc functions: crc32 + Shilei + 2015-06-30 09:49:58 -0700 + Commit: 722aa5f, github.com/apache/spark/pull/7108 + + [SPARK-8680] [SQL] Slightly improve PropagateTypes + Liang-Chi Hsieh + 2015-06-30 08:17:24 -0700 + Commit: a48e619, github.com/apache/spark/pull/7087 + + [SPARK-8723] [SQL] improve divide and remainder code gen + Wenchen Fan + 2015-06-30 08:08:15 -0700 + Commit: 865a834, github.com/apache/spark/pull/7111 + + [SPARK-8590] [SQL] add code gen for ExtractValue + Wenchen Fan + 2015-06-30 07:58:49 -0700 + Commit: 08fab48, github.com/apache/spark/pull/6982 + + [SPARK-7756] [CORE] More robust SSL options processing. + Tim Ellison + 2015-06-30 13:49:52 +0100 + Commit: 2ed0c0a, github.com/apache/spark/pull/7043 + + [SPARK-8551] [ML] Elastic net python code example + Shuo Xiang + 2015-06-29 23:50:34 -0700 + Commit: 5452457, github.com/apache/spark/pull/6946 + + [SPARK-8434][SQL]Add a "pretty" parameter to the "show" method to display long strings + zsxwing + 2015-06-29 23:44:11 -0700 + Commit: 12671dd, github.com/apache/spark/pull/6877 + + [SPARK-5161] [HOTFIX] Fix bug in Python test failure reporting + Josh Rosen + 2015-06-29 23:08:51 -0700 + Commit: 6c5a6db, github.com/apache/spark/pull/7112 + + [SPARK-8650] [SQL] Use the user-specified app name priority in SparkSQLCLIDriver or HiveThriftServer2 + Yadong Qi + 2015-06-29 22:34:38 -0700 + Commit: e6c3f74, github.com/apache/spark/pull/7030 + + [SPARK-8721][SQL] Rename ExpectsInputTypes => AutoCastInputTypes. + Reynold Xin + 2015-06-29 22:32:43 -0700 + Commit: f79410c, github.com/apache/spark/pull/7109 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2015-06-29 21:41:59 -0700 + Commit: ea775b0, github.com/apache/spark/pull/1767 + + [SPARK-5161] Parallelize Python test execution + Josh Rosen + 2015-06-29 21:32:40 -0700 + Commit: 7bbbe38, github.com/apache/spark/pull/7031 + + [SPARK-7667] [MLLIB] MLlib Python API consistency check + Yanbo Liang + 2015-06-29 18:50:23 -0700 + Commit: f9b6bf2, github.com/apache/spark/pull/6856 + + [SPARK-8669] [SQL] Fix crash with BINARY (ENUM) fields with Parquet 1.7 + Steven She + 2015-06-29 18:50:09 -0700 + Commit: 4915e9e, github.com/apache/spark/pull/7048 + + [SPARK-8715] ArrayOutOfBoundsException fixed for DataFrameStatSuite.crosstab + Burak Yavuz + 2015-06-29 18:48:28 -0700 + Commit: ecacb1e, github.com/apache/spark/pull/7100 + + [SPARK-8456] [ML] Ngram featurizer python + Feynman Liang + 2015-06-29 18:40:30 -0700 + Commit: 620605a, github.com/apache/spark/pull/6960 + + Revert "[SPARK-8437] [DOCS] Using directory path without wildcard for filename slow for large number of files with wholeTextFiles and binaryFiles" + Andrew Or + 2015-06-29 18:32:31 -0700 + Commit: 4c1808b + + [SPARK-8019] [SPARKR] Support SparkR spawning worker R processes with a command other then Rscript + Michael Sannella x268 + 2015-06-29 17:28:28 -0700 + Commit: 4a9e03f, github.com/apache/spark/pull/6557 + + [SPARK-8410] [SPARK-8475] remove previous ivy resolution when using spark-submit + Burak Yavuz + 2015-06-29 17:27:02 -0700 + Commit: d7f796d, github.com/apache/spark/pull/7089 + + [SPARK-8437] [DOCS] Using directory path without wildcard for filename slow for large number of files with wholeTextFiles and binaryFiles + Sean Owen + 2015-06-29 17:21:35 -0700 + Commit: 5d30eae, github.com/apache/spark/pull/7036 + + [SPARK-7287] [SPARK-8567] [TEST] Add sc.stop to applications in SparkSubmitSuite + Yin Huai + 2015-06-29 17:20:05 -0700 + Commit: fbf7573, github.com/apache/spark/pull/7027 + + [SPARK-8634] [STREAMING] [TESTS] Fix flaky test StreamingListenerSuite "receiver info reporting" + zsxwing + 2015-06-29 17:19:05 -0700 + Commit: cec9852, github.com/apache/spark/pull/7017 + + [SPARK-8589] [SQL] cleanup DateTimeUtils + Wenchen Fan + 2015-06-29 16:34:50 -0700 + Commit: 881662e, github.com/apache/spark/pull/6980 + + [SPARK-8710] [SQL] Change ScalaReflection.mirror from a val to a def. 
+ Yin Huai + 2015-06-29 16:26:05 -0700 + Commit: 4b497a7, github.com/apache/spark/pull/7094 + + [SPARK-8661][ML] for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments, to make copy-pasting R code more simple + Rosstin + 2015-06-29 16:09:29 -0700 + Commit: 4e880cf, github.com/apache/spark/pull/7098 + + [SPARK-8579] [SQL] support arbitrary object in UnsafeRow + Davies Liu + 2015-06-29 15:59:20 -0700 + Commit: ed359de, github.com/apache/spark/pull/6959 + + [SPARK-8478] [SQL] Harmonize UDF-related code to use uniformly UDF instead of Udf + BenFradet + 2015-06-29 15:27:13 -0700 + Commit: 931da5c, github.com/apache/spark/pull/6920 + + [SPARK-8660][ML] Convert JavaDoc style comments inLogisticRegressionSuite.scala to regular multiline comments, to make copy-pasting R commands easier + Rosstin + 2015-06-29 14:45:08 -0700 + Commit: c8ae887, github.com/apache/spark/pull/7096 + + [SPARK-7810] [PYSPARK] solve python rdd socket connection problem + Ai He , AiHe + 2015-06-29 14:36:26 -0700 + Commit: ecd3aac, github.com/apache/spark/pull/6338 + + [SPARK-8056][SQL] Design an easier way to construct schema for both Scala and Python + Ilya Ganelin + 2015-06-29 14:15:15 -0700 + Commit: f6fc254, github.com/apache/spark/pull/6686 + + [SPARK-8709] Exclude hadoop-client's mockito-all dependency + Josh Rosen + 2015-06-29 14:07:55 -0700 + Commit: 27ef854, github.com/apache/spark/pull/7090 + + [SPARK-8070] [SQL] [PYSPARK] avoid spark jobs in createDataFrame + Davies Liu + 2015-06-29 13:20:55 -0700 + Commit: afae976, github.com/apache/spark/pull/6606 + + [SPARK-8681] fixed wrong ordering of columns in crosstab + Burak Yavuz + 2015-06-29 13:15:04 -0700 + Commit: be7ef06, github.com/apache/spark/pull/7060 + + [SPARK-7862] [SQL] Disable the error message redirect to stderr + Cheng Hao + 2015-06-29 12:46:33 -0700 + Commit: c6ba2ea, github.com/apache/spark/pull/6882 + + [SPARK-8214] [SQL] Add function hex + zhichao.li + 2015-06-29 12:25:16 -0700 + Commit: 637b4ee, github.com/apache/spark/pull/6976 + + [SQL][DOCS] Remove wrong example from DataFrame.scala + Kousuke Saruta + 2015-06-29 12:16:12 -0700 + Commit: 94e040d, github.com/apache/spark/pull/6977 + + [SPARK-8528] Expose SparkContext.applicationId in PySpark + Vladimir Vladimirov + 2015-06-29 12:03:41 -0700 + Commit: 492dca3, github.com/apache/spark/pull/6936 + + [SPARK-8235] [SQL] misc function sha / sha1 + Tarek Auel , Tarek Auel + 2015-06-29 11:57:19 -0700 + Commit: a5c2961, github.com/apache/spark/pull/6963 + + [SPARK-8066, SPARK-8067] [hive] Add support for Hive 1.0, 1.1 and 1.2. 
+ Marcelo Vanzin + 2015-06-29 11:53:17 -0700 + Commit: 3664ee2, github.com/apache/spark/pull/7026 + + [SPARK-8692] [SQL] re-order the case statements that handling catalyst data types + Wenchen Fan + 2015-06-29 11:41:26 -0700 + Commit: ed413bc, github.com/apache/spark/pull/7073 + + Revert "[SPARK-8372] History server shows incorrect information for application not started" + Andrew Or + 2015-06-29 10:52:05 -0700 + Commit: ea88b1a + + [SPARK-8554] Add the SparkR document files to `.rat-excludes` for `./dev/check-license` + Yu ISHIKAWA + 2015-06-29 09:22:55 -0700 + Commit: 715f084, github.com/apache/spark/pull/6947 + + [SPARK-8693] [PROJECT INFRA] profiles and goals are not printed in a nice way + Brennon York + 2015-06-29 08:55:06 -0700 + Commit: 5c796d5, github.com/apache/spark/pull/7085 + + [SPARK-8702] [WEBUI] Avoid massive concating strings in Javascript + zsxwing + 2015-06-30 00:46:55 +0900 + Commit: 630bd5f, github.com/apache/spark/pull/7082 + + [SPARK-8698] partitionBy in Python DataFrame reader/writer interface should not default to empty tuple. + Reynold Xin + 2015-06-29 00:22:44 -0700 + Commit: 660c6ce, github.com/apache/spark/pull/7079 + + [SPARK-8355] [SQL] Python DataFrameReader/Writer should mirror Scala + Cheolsoo Park + 2015-06-29 00:13:39 -0700 + Commit: ac2e17b, github.com/apache/spark/pull/7078 + + [SPARK-8575] [SQL] Deprecate callUDF in favor of udf + BenFradet + 2015-06-28 22:43:47 -0700 + Commit: 0b10662, github.com/apache/spark/pull/6993 + + [SPARK-5962] [MLLIB] Python support for Power Iteration Clustering + Yanbo Liang + 2015-06-28 22:38:04 -0700 + Commit: dfde31d, github.com/apache/spark/pull/6992 + + [SPARK-7212] [MLLIB] Add sequence learning flag + Feynman Liang + 2015-06-28 22:26:07 -0700 + Commit: 25f574e, github.com/apache/spark/pull/6997 + + [SPARK-7845] [BUILD] Bumping default Hadoop version used in profile hadoop-1 to 1.2.1 + Cheng Lian + 2015-06-28 19:34:59 -0700 + Commit: 00a9d22, github.com/apache/spark/pull/7062 + + [SPARK-8677] [SQL] Fix non-terminating decimal expansion for decimal divide operation + Liang-Chi Hsieh + 2015-06-28 14:48:44 -0700 + Commit: 24fda73, github.com/apache/spark/pull/7056 + + [SPARK-8596] [EC2] Added port for Rstudio + Vincent D. 
Warmerdam , vincent + 2015-06-28 13:33:33 -0700 + Commit: 9ce78b4, github.com/apache/spark/pull/7068 + + [SPARK-8686] [SQL] DataFrame should support `where` with expression represented by String + Kousuke Saruta + 2015-06-28 08:29:07 -0700 + Commit: ec78438, github.com/apache/spark/pull/7063 + + [SPARK-8610] [SQL] Separate Row and InternalRow (part 2) + Davies Liu + 2015-06-28 08:03:58 -0700 + Commit: 77da5be, github.com/apache/spark/pull/7003 + + [SPARK-8649] [BUILD] Mapr repository is not defined properly + Thomas Szymanski + 2015-06-28 01:06:49 -0700 + Commit: 52d1281, github.com/apache/spark/pull/7054 + + [SPARK-8683] [BUILD] Depend on mockito-core instead of mockito-all + Josh Rosen + 2015-06-27 23:27:52 -0700 + Commit: f510045, github.com/apache/spark/pull/7061 + + [HOTFIX] Fix pull request builder bug in #6967 + Josh Rosen + 2015-06-27 23:07:20 -0700 + Commit: 42db3a1 + + [SPARK-8583] [SPARK-5482] [BUILD] Refactor python/run-tests to integrate with dev/run-tests module system + Josh Rosen + 2015-06-27 20:24:34 -0700 + Commit: 40648c5, github.com/apache/spark/pull/6967 + + [SPARK-8606] Prevent exceptions in RDD.getPreferredLocations() from crashing DAGScheduler + Josh Rosen + 2015-06-27 14:40:45 -0700 + Commit: 0b5abbf, github.com/apache/spark/pull/7023 + + [SPARK-8623] Hadoop RDDs fail to properly serialize configuration + Sandy Ryza + 2015-06-27 14:33:31 -0700 + Commit: 4153776, github.com/apache/spark/pull/7050 + + [SPARK-3629] [YARN] [DOCS]: Improvement of the "Running Spark on YARN" document + Neelesh Srinivas Salian + 2015-06-27 09:07:10 +0300 + Commit: d48e789, github.com/apache/spark/pull/6924 + + [SPARK-8639] [DOCS] Fixed Minor Typos in Documentation + Rosstin + 2015-06-27 08:47:00 +0300 + Commit: b5a6663, github.com/apache/spark/pull/7046 + + [SPARK-8607] SparkR -- jars not being added to application classpath correctly + cafreeman + 2015-06-26 17:06:02 -0700 + Commit: 9d11817, github.com/apache/spark/pull/7001 + + [SPARK-8662] SparkR Update SparkSQL Test + cafreeman + 2015-06-26 10:07:35 -0700 + Commit: a56516f, github.com/apache/spark/pull/7045 + + [SPARK-8652] [PYSPARK] Check return value for all uses of doctest.testmod() + Josh Rosen + 2015-06-26 08:12:22 -0700 + Commit: 41afa16, github.com/apache/spark/pull/7032 + + [SPARK-8302] Support heterogeneous cluster install paths on YARN. 
+ Marcelo Vanzin + 2015-06-26 08:45:22 -0500 + Commit: 37bf76a, github.com/apache/spark/pull/6752 + + [SPARK-8613] [ML] [TRIVIAL] add param to disable linear feature scaling + Holden Karau + 2015-06-26 01:19:05 -0700 + Commit: c9e05a3, github.com/apache/spark/pull/7024 + + [SPARK-8344] Add message processing time metric to DAGScheduler + Josh Rosen + 2015-06-26 00:12:05 -0700 + Commit: 9fed6ab, github.com/apache/spark/pull/7002 + + [SPARK-8635] [SQL] improve performance of CatalystTypeConverters + Wenchen Fan + 2015-06-25 22:44:26 -0700 + Commit: 1a79f0e, github.com/apache/spark/pull/7018 + + [SPARK-8620] [SQL] cleanup CodeGenContext + Wenchen Fan + 2015-06-25 22:16:53 -0700 + Commit: 4036011, github.com/apache/spark/pull/7010 + + [SPARK-8237] [SQL] Add misc function sha2 + Liang-Chi Hsieh + 2015-06-25 22:07:37 -0700 + Commit: 47c874b, github.com/apache/spark/pull/6934 + + [SPARK-8637] [SPARKR] [HOTFIX] Fix packages argument, sparkSubmitBinName + Shivaram Venkataraman + 2015-06-25 10:56:00 -0700 + Commit: c392a9e, github.com/apache/spark/pull/7022 + + [MINOR] [MLLIB] rename some functions of PythonMLLibAPI + Yanbo Liang + 2015-06-25 08:13:17 -0700 + Commit: 2519dcc, github.com/apache/spark/pull/7011 + + [SPARK-8567] [SQL] Add logs to record the progress of HiveSparkSubmitSuite. + Yin Huai + 2015-06-25 06:52:03 -0700 + Commit: f9b397f, github.com/apache/spark/pull/7009 + + [SPARK-8574] org/apache/spark/unsafe doesn't honor the java source/ta… + Tom Graves , Thomas Graves + 2015-06-25 08:27:08 -0500 + Commit: e988adb, github.com/apache/spark/pull/6989 + + [SPARK-5768] [WEB UI] Fix for incorrect memory in Spark UI + Joshi , Rekha Joshi + 2015-06-25 20:21:34 +0900 + Commit: 085a721, github.com/apache/spark/pull/6972 + + [SPARK-8604] [SQL] HadoopFsRelation subclasses should set their output format class + Cheng Lian + 2015-06-25 00:06:23 -0700 + Commit: c337844, github.com/apache/spark/pull/6998 + + [SPARK-7884] Move block deserialization from BlockStoreShuffleFetcher to ShuffleReader + Matt Massie , Kay Ousterhout + 2015-06-24 22:09:31 -0700 + Commit: 7bac2fe, github.com/apache/spark/pull/6423 + + Two minor SQL cleanup (compiler warning & indent). 
+ Reynold Xin + 2015-06-24 19:34:07 -0700 + Commit: 82f80c1, github.com/apache/spark/pull/7000 + + [SPARK-8075] [SQL] apply type check interface to more expressions + Wenchen Fan + 2015-06-24 16:26:00 -0700 + Commit: b71d325, github.com/apache/spark/pull/6723 + + [SPARK-8567] [SQL] Increase the timeout of HiveSparkSubmitSuite + Yin Huai + 2015-06-24 15:52:58 -0700 + Commit: 7daa702, github.com/apache/spark/pull/6957 + + [SPARK-8558] [BUILD] Script /dev/run-tests fails when _JAVA_OPTIONS env var set + fe2s , Oleksiy Dyagilev + 2015-06-24 15:12:23 -0700 + Commit: dca21a8, github.com/apache/spark/pull/6956 + + [SPARK-6777] [SQL] Implements backwards compatibility rules in CatalystSchemaConverter + Cheng Lian + 2015-06-24 15:03:43 -0700 + Commit: 8ab5076, github.com/apache/spark/pull/6617 + + [SPARK-7633] [MLLIB] [PYSPARK] Python bindings for StreamingLogisticRegressionwithSGD + MechCoder + 2015-06-24 14:58:43 -0700 + Commit: fb32c38, github.com/apache/spark/pull/6849 + + [SPARK-7289] handle project -> limit -> sort efficiently + Wenchen Fan + 2015-06-24 13:28:50 -0700 + Commit: f04b567, github.com/apache/spark/pull/6780 + + [SPARK-7088] [SQL] Fix analysis for 3rd party logical plan. + Santiago M. Mola + 2015-06-24 12:29:07 -0700 + Commit: b84d4b4, github.com/apache/spark/pull/6853 + + [SPARK-8506] Add pakages to R context created through init. + Holden Karau + 2015-06-24 11:55:20 -0700 + Commit: 43e6619, github.com/apache/spark/pull/6928 + + [SPARK-8399] [STREAMING] [WEB UI] Overlap between histograms and axis' name in Spark Streaming UI + BenFradet + 2015-06-24 11:53:03 -0700 + Commit: 1173483, github.com/apache/spark/pull/6845 + + [SPARK-8576] Add spark-ec2 options to set IAM roles and instance-initiated shutdown behavior + Nicholas Chammas + 2015-06-24 11:20:51 -0700 + Commit: 31f48e5, github.com/apache/spark/pull/6962 + + [SPARK-8578] [SQL] Should ignore user defined output committer when appending data + Yin Huai + 2015-06-24 09:50:03 -0700 + Commit: bba6699, github.com/apache/spark/pull/6964 + + [SPARK-8567] [SQL] Debugging flaky HiveSparkSubmitSuite + Cheng Lian + 2015-06-24 09:49:20 -0700 + Commit: 9d36ec2, github.com/apache/spark/pull/6978 + + [SPARK-8138] [SQL] Improves error message when conflicting partition columns are found + Cheng Lian + 2015-06-24 02:17:12 -0700 + Commit: cc465fd, github.com/apache/spark/pull/6610 + + [SPARK-8371] [SQL] improve unit test for MaxOf and MinOf and fix bugs + Wenchen Fan + 2015-06-23 23:11:42 -0700 + Commit: 09fcf96, github.com/apache/spark/pull/6825 + + [HOTFIX] [BUILD] Fix MiMa checks in master branch; enable MiMa for launcher project + Josh Rosen + 2015-06-23 23:03:59 -0700 + Commit: 13ae806, github.com/apache/spark/pull/6974 + + [SPARK-6749] [SQL] Make metastore client robust to underlying socket connection loss + Eric Liang + 2015-06-23 22:27:17 -0700 + Commit: 50c3a86, github.com/apache/spark/pull/6912 + + Revert "[SPARK-7157][SQL] add sampleBy to DataFrame" + Reynold Xin + 2015-06-23 19:30:25 -0700 + Commit: a458efc + + [SPARK-7157][SQL] add sampleBy to DataFrame + Xiangrui Meng + 2015-06-23 17:46:29 -0700 + Commit: 0401cba, github.com/apache/spark/pull/6769 + + [SPARK-8139] [SQL] Updates docs and comments of data sources and Parquet output committer options + Cheng Lian + 2015-06-23 17:24:26 
-0700 + Commit: 111d6b9, github.com/apache/spark/pull/6683 + + [SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in booelan expression + Davies Liu + 2015-06-23 15:51:16 -0700 + Commit: 7fb5ae5, github.com/apache/spark/pull/6961 + + [DOC] [SQL] Addes Hive metastore Parquet table conversion section + Cheng Lian + 2015-06-23 14:19:21 -0700 + Commit: d96d7b5, github.com/apache/spark/pull/5348 + + [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector + Oleksiy Dyagilev + 2015-06-23 13:12:19 -0700 + Commit: a803118, github.com/apache/spark/pull/6954 + + [SPARK-8111] [SPARKR] SparkR shell should display Spark logo and version banner on startup. + Alok Singh , Alok Singh + 2015-06-23 12:47:55 -0700 + Commit: f2fb028, github.com/apache/spark/pull/6944 + + [SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils + MechCoder + 2015-06-23 12:43:32 -0700 + Commit: f2022fa, github.com/apache/spark/pull/6715 + + [SPARK-7888] Be able to disable intercept in linear regression in ml package + Holden Karau + 2015-06-23 12:42:17 -0700 + Commit: 2b1111d, github.com/apache/spark/pull/6927 + + [SPARK-8432] [SQL] fix hashCode() and equals() of BinaryType in Row + Davies Liu + 2015-06-23 11:55:47 -0700 + Commit: 6f4cadf, github.com/apache/spark/pull/6876 + + [SPARK-7235] [SQL] Refactor the grouping sets + Cheng Hao + 2015-06-23 10:52:17 -0700 + Commit: 7b1450b, github.com/apache/spark/pull/5780 + + [SQL] [DOCS] updated the documentation for explode + lockwobr + 2015-06-24 02:48:56 +0900 + Commit: 4f7fbef, github.com/apache/spark/pull/6943 + + [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer + Holden Karau + 2015-06-23 09:08:11 -0700 + Commit: 0f92be5, github.com/apache/spark/pull/6918 + + [SPARK-8300] DataFrame hint for broadcast join. + Reynold Xin + 2015-06-23 01:50:31 -0700 + Commit: 6ceb169, github.com/apache/spark/pull/6751 + + [SPARK-8541] [PYSPARK] test the absolute error in approx doctests + Scott Taylor + 2015-06-22 23:37:56 -0700 + Commit: f0dcbe8, github.com/apache/spark/pull/6942 + + [SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Si… + Hari Shreedharan + 2015-06-22 23:34:17 -0700 + Commit: 9b618fb, github.com/apache/spark/pull/6910 + + [SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication + Liang-Chi Hsieh + 2015-06-22 23:11:56 -0700 + Commit: 31bd306, github.com/apache/spark/pull/6814 + + [SPARK-8431] [SPARKR] Add in operator to DataFrame Column in SparkR + Yu ISHIKAWA + 2015-06-22 23:04:36 -0700 + Commit: d4f6335, github.com/apache/spark/pull/6941 + + [SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins + Holden Karau + 2015-06-22 22:40:19 -0700 + Commit: 164fe2a, github.com/apache/spark/pull/6331 + + [SPARK-8548] [SPARKR] Remove the trailing whitespaces from the SparkR files + Yu ISHIKAWA + 2015-06-22 20:55:38 -0700 + Commit: 44fa7df, github.com/apache/spark/pull/6945 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2015-06-22 20:25:32 -0700 + Commit: c4d2343, github.com/apache/spark/pull/2849 + + [SPARK-7859] [SQL] Collect_set() behavior differences which fails the unit test under jdk8 + Cheng Hao + 2015-06-22 20:04:49 -0700 + Commit: 13321e6, github.com/apache/spark/pull/6402 + + [SPARK-8307] [SQL] improve timestamp from parquet + Davies Liu + 2015-06-22 18:03:59 -0700 + Commit: 6b7f2ce, github.com/apache/spark/pull/6759 + + [SPARK-7153] [SQL] support all integral type ordinal in GetArrayItem + Wenchen Fan + 2015-06-22 17:37:35 -0700 + Commit: 860a49e, github.com/apache/spark/pull/5706 + + [HOTFIX] [TESTS] Typo mqqt -> mqtt + Andrew Or + 2015-06-22 16:16:26 -0700 + Commit: 1dfb0f7 + + [SPARK-8492] [SQL] support binaryType in UnsafeRow + Davies Liu + 2015-06-22 15:22:17 -0700 + Commit: 96aa013, github.com/apache/spark/pull/6911 + + [SPARK-8356] [SQL] Reconcile callUDF and callUdf + BenFradet + 2015-06-22 15:06:47 -0700 + Commit: 50d3242, github.com/apache/spark/pull/6902 + + [SPARK-8537] [SPARKR] Add a validation rule about the curly braces in SparkR to `.lintr` + Yu ISHIKAWA + 2015-06-22 14:35:38 -0700 + Commit: b1f3a48, github.com/apache/spark/pull/6940 + + [SPARK-8455] [ML] Implement n-gram feature transformer + Feynman Liang + 2015-06-22 14:15:35 -0700 + Commit: afe35f0, github.com/apache/spark/pull/6887 + + [SPARK-8532] [SQL] In Python's DataFrameWriter, save/saveAsTable/json/parquet/jdbc always override mode + Yin Huai + 2015-06-22 13:51:23 -0700 + Commit: 5ab9fcf, github.com/apache/spark/pull/6937 + + [SPARK-8104] [SQL] auto alias expressions in analyzer + Wenchen Fan + 2015-06-22 12:13:00 -0700 + Commit: da7bbb9, github.com/apache/spark/pull/6647 + + [SPARK-8511] [PYSPARK] Modify a test to remove a saved model in `regression.py` + Yu ISHIKAWA + 2015-06-22 11:53:11 -0700 + Commit: 5d89d9f, github.com/apache/spark/pull/6926 + + [SPARK-8482] Added M4 instances to the list. + Pradeep Chhetri + 2015-06-22 11:45:31 -0700 + Commit: ba8a453, github.com/apache/spark/pull/6899 + + [SPARK-8429] [EC2] Add ability to set additional tags + Stefano Parmesan + 2015-06-22 11:43:10 -0700 + Commit: 42a1f71, github.com/apache/spark/pull/6857 + + [SPARK-8406] [SQL] Adding UUID to output file name to avoid accidental overwriting + Cheng Lian + 2015-06-22 10:03:57 -0700 + Commit: 0818fde, github.com/apache/spark/pull/6864 + + [SPARK-7426] [MLLIB] [ML] Updated Attribute.fromStructField to allow any NumericType. + Mike Dusenberry + 2015-06-21 18:25:36 -0700 + Commit: 47c1d56, github.com/apache/spark/pull/6540 + + [SPARK-7715] [MLLIB] [ML] [DOC] Updated MLlib programming guide for release 1.4 + Joseph K. 
Bradley + 2015-06-21 16:25:25 -0700 + Commit: a189442, github.com/apache/spark/pull/6897 + + [SPARK-8508] [SQL] Ignores a test case to cleanup unnecessary testing output until #6882 is merged + Cheng Lian + 2015-06-21 13:20:28 -0700 + Commit: 83cdfd8, github.com/apache/spark/pull/6925 + + [SPARK-7604] [MLLIB] Python API for PCA and PCAModel + Yanbo Liang + 2015-06-21 12:04:20 -0700 + Commit: 32e3cda, github.com/apache/spark/pull/6315 + + [SPARK-8379] [SQL] avoid speculative tasks write to the same file + jeanlyn + 2015-06-21 00:13:40 -0700 + Commit: a1e3649, github.com/apache/spark/pull/6833 + + [SPARK-8301] [SQL] Improve UTF8String substring/startsWith/endsWith/contains performance + Tarek Auel , Tarek Auel + 2015-06-20 20:03:59 -0700 + Commit: 41ab285, github.com/apache/spark/pull/6804 + + [SPARK-8495] [SPARKR] Add a `.lintr` file to validate the SparkR files and the `lint-r` script + Yu ISHIKAWA + 2015-06-20 16:10:14 -0700 + Commit: 004f573, github.com/apache/spark/pull/6922 + + [SPARK-8422] [BUILD] [PROJECT INFRA] Add a module abstraction to dev/run-tests + Josh Rosen + 2015-06-20 16:04:45 -0700 + Commit: 7a3c424, github.com/apache/spark/pull/6866 + + [SPARK-8468] [ML] Take the negative of some metrics in RegressionEvaluator to get correct cross validation + Liang-Chi Hsieh + 2015-06-20 13:01:59 -0700 + Commit: 0b89951, github.com/apache/spark/pull/6905 + + [SPARK-8127] [STREAMING] [KAFKA] KafkaRDD optimize count() take() isEmpty() + cody koeninger + 2015-06-19 18:54:07 -0700 + Commit: 1b6fe9b, github.com/apache/spark/pull/6632 + + [HOTFIX] [SPARK-8489] Correct JIRA number in previous commit + Andrew Or + 2015-06-19 17:39:26 -0700 + Commit: bec40e5 + + [SPARK-8498] [SQL] Add regression test for SPARK-8470 + Andrew Or + 2015-06-19 17:34:09 -0700 + Commit: 093c348, github.com/apache/spark/pull/6909 + + [SPARK-8390] [STREAMING] [KAFKA] fix docs related to HasOffsetRanges + cody koeninger + 2015-06-19 17:16:56 -0700 + Commit: b305e37, github.com/apache/spark/pull/6863 + + [SPARK-8420] [SQL] Fix comparision of timestamps/dates with strings + Michael Armbrust + 2015-06-19 16:54:51 -0700 + Commit: a333a72, github.com/apache/spark/pull/6888 + + [SPARK-8093] [SQL] Remove empty structs inferred from JSON documents + Nathan Howell + 2015-06-19 16:19:28 -0700 + Commit: 9814b97, github.com/apache/spark/pull/6799 + + [SPARK-8452] [SPARKR] expose jobGroup API in SparkR + Hossein + 2015-06-19 15:47:22 -0700 + Commit: 1fa29c2, github.com/apache/spark/pull/6889 + + [SPARK-4118] [MLLIB] [PYSPARK] Python bindings for StreamingKMeans + MechCoder + 2015-06-19 12:23:15 -0700 + Commit: 54976e5, github.com/apache/spark/pull/6499 + + [SPARK-8461] [SQL] fix codegen with REPL class loader + Davies Liu + 2015-06-19 11:40:04 -0700 + Commit: e41e2fd, github.com/apache/spark/pull/6898 + + [HOTFIX] Fix scala style in DFSReadWriteTest that causes tests failed + Liang-Chi Hsieh + 2015-06-19 11:36:59 -0700 + Commit: 4a462c2, github.com/apache/spark/pull/6907 + + [SPARK-8368] [SPARK-8058] [SQL] HiveContext may override the context class loader of the current thread + Yin Huai + 2015-06-19 11:11:58 -0700 + Commit: c5876e5, github.com/apache/spark/pull/6891 + + [SPARK-5836] [DOCS] [STREAMING] Clarify what may cause long-running Spark apps to preserve shuffle files + Sean Owen 
+ 2015-06-19 11:03:04 -0700 + Commit: 4be53d0, github.com/apache/spark/pull/6901 + + [SPARK-8451] [SPARK-7287] SparkSubmitSuite should check exit code + Andrew Or + 2015-06-19 10:56:19 -0700 + Commit: 68a2dca, github.com/apache/spark/pull/6886 + + [SPARK-7180] [SPARK-8090] [SPARK-8091] Fix a number of SerializationDebugger bugs and limitations + Tathagata Das + 2015-06-19 10:52:30 -0700 + Commit: 866816e, github.com/apache/spark/pull/6625 + + Add example that reads a local file, writes to a DFS path provided by th... + RJ Nowling + 2015-06-19 10:50:44 -0700 + Commit: a985803, github.com/apache/spark/pull/3347 + + [SPARK-8234][SQL] misc function: md5 + Shilei + 2015-06-19 10:49:27 -0700 + Commit: 0c32fc1, github.com/apache/spark/pull/6779 + + [SPARK-8476] [CORE] Setters inc/decDiskBytesSpilled in TaskMetrics should also be private. + Takuya UESHIN + 2015-06-19 10:48:16 -0700 + Commit: fe08561, github.com/apache/spark/pull/6896 + + [SPARK-8430] ExternalShuffleBlockResolver of shuffle service should support UnsafeShuffleManager + Lianhui Wang + 2015-06-19 10:47:07 -0700 + Commit: 9baf093, github.com/apache/spark/pull/6873 + + [SPARK-8207] [SQL] Add math function bin + Liang-Chi Hsieh + 2015-06-19 10:09:31 -0700 + Commit: 2c59d5c, github.com/apache/spark/pull/6721 + + [SPARK-8151] [MLLIB] pipeline components should correctly implement copy + Xiangrui Meng + 2015-06-19 09:46:51 -0700 + Commit: 43c7ec6, github.com/apache/spark/pull/6622 + + [SPARK-8389] [STREAMING] [KAFKA] Example of getting offset ranges out o… + cody koeninger + 2015-06-19 14:51:19 +0200 + Commit: 47af7c1, github.com/apache/spark/pull/6846 + + [SPARK-7265] Improving documentation for Spark SQL Hive support + Jihong MA + 2015-06-19 14:05:11 +0200 + Commit: ebd363a, github.com/apache/spark/pull/5933 + + [SPARK-7913] [CORE] Make AppendOnlyMap use the same growth strategy of OpenHashSet and consistent exception message + zsxwing + 2015-06-19 11:58:07 +0200 + Commit: 93360dc, github.com/apache/spark/pull/6879 + + [SPARK-8387] [FOLLOWUP ] [WEBUI] Update driver log URL to show only 4096 bytes + Carson Wang + 2015-06-19 09:57:12 +0200 + Commit: 54557f3, github.com/apache/spark/pull/6878 + + [SPARK-8339] [PYSPARK] integer division for python 3 + Kevin Conor + 2015-06-19 00:12:20 -0700 + Commit: fdf63f1, github.com/apache/spark/pull/6794 + + [SPARK-8444] [STREAMING] Adding Python streaming example for queueStream + Bryan Cutler + 2015-06-19 00:07:53 -0700 + Commit: a2016b4, github.com/apache/spark/pull/6884 + + [SPARK-8348][SQL] Add in operator to DataFrame Column + Yu ISHIKAWA + 2015-06-18 23:13:05 -0700 + Commit: 754929b, github.com/apache/spark/pull/6824 + + [SPARK-8458] [SQL] Don't strip scheme part of output path when writing ORC files + Cheng Lian + 2015-06-18 22:01:52 -0700 + Commit: a71cbbd, github.com/apache/spark/pull/6892 + + [SPARK-8080] [STREAMING] Receiver.store with Iterator does not give correct count at Spark UI + Dibyendu Bhattacharya , U-PEROOT\UBHATD1 + 2015-06-18 19:58:47 -0700 + Commit: 3eaed87, github.com/apache/spark/pull/6707 + + [SPARK-8462] [DOCS] Documentation fixes for Spark SQL + Lars Francke + 2015-06-18 19:40:32 -0700 + Commit: 4ce3bab, github.com/apache/spark/pull/6890 + + [SPARK-8135] Don't load defaults when reconstituting Hadoop 
Configurations + Sandy Ryza + 2015-06-18 19:36:05 -0700 + Commit: 43f50de, github.com/apache/spark/pull/6679 + + [SPARK-8218][SQL] Binary log math function update. + Reynold Xin + 2015-06-18 18:41:15 -0700 + Commit: dc41313, github.com/apache/spark/pull/6871 + + [SPARK-8446] [SQL] Add helper functions for testing SparkPlan physical operators + Josh Rosen , Josh Rosen , Michael Armbrust + 2015-06-18 16:45:14 -0700 + Commit: 207a98c, github.com/apache/spark/pull/6885 + + [SPARK-8376] [DOCS] Add common lang3 to the Spark Flume Sink doc + zsxwing + 2015-06-18 16:00:27 -0700 + Commit: 24e5379, github.com/apache/spark/pull/6829 + + [SPARK-8353] [DOCS] Show anchor links when hovering over documentation headers + Josh Rosen + 2015-06-18 15:10:09 -0700 + Commit: 44c931f, github.com/apache/spark/pull/6808 + + [SPARK-8202] [PYSPARK] fix infinite loop during external sort in PySpark + Davies Liu + 2015-06-18 13:45:58 -0700 + Commit: 9b20027, github.com/apache/spark/pull/6714 + + [SPARK-8363][SQL] Move sqrt to math and extend UnaryMathExpression + Liang-Chi Hsieh + 2015-06-18 13:00:31 -0700 + Commit: 3164112, github.com/apache/spark/pull/6823 + + [SPARK-8320] [STREAMING] Add example in streaming programming guide that shows union of multiple input streams + Neelesh Srinivas Salian + 2015-06-18 09:44:36 -0700 + Commit: ddc5baf, github.com/apache/spark/pull/6862 + + [SPARK-8283][SQL] Resolve udf_struct test failure in HiveCompatibilitySuite + Yijie Shen + 2015-06-17 23:46:57 -0700 + Commit: e86fbdb, github.com/apache/spark/pull/6828 + + [SPARK-8218][SQL] Add binary log math function + Liang-Chi Hsieh + 2015-06-17 23:31:30 -0700 + Commit: fee3438, github.com/apache/spark/pull/6725 + + [SPARK-7961][SQL]Refactor SQLConf to display better error message + zsxwing + 2015-06-17 23:22:54 -0700 + Commit: 78a430e, github.com/apache/spark/pull/6747 + + [SPARK-8381][SQL]reuse typeConvert when convert Seq[Row] to catalyst type + Lianhui Wang + 2015-06-17 22:52:47 -0700 + Commit: 9db73ec, github.com/apache/spark/pull/6831 + + [SPARK-8095] Resolve dependencies of --packages in local ivy cache + Burak Yavuz + 2015-06-17 22:33:37 -0700 + Commit: 3b61077, github.com/apache/spark/pull/6788 + + [SPARK-8392] RDDOperationGraph: getting cached nodes is slow + xutingjun + 2015-06-17 22:31:01 -0700 + Commit: e2cdb05, github.com/apache/spark/pull/6839 + + [SPARK-7605] [MLLIB] [PYSPARK] Python API for ElementwiseProduct + MechCoder + 2015-06-17 22:08:38 -0700 + Commit: 22732e1, github.com/apache/spark/pull/6346 + + [SPARK-8373] [PYSPARK] Remove PythonRDD.emptyRDD + zsxwing + 2015-06-17 22:07:16 -0700 + Commit: 4817ccd, github.com/apache/spark/pull/6867 + + [HOTFIX] [PROJECT-INFRA] Fix bug in dev/run-tests for MLlib-only PRs + Josh Rosen + 2015-06-17 19:02:25 -0700 + Commit: 165f52f + + [SPARK-8397] [SQL] Allow custom configuration for TestHive + Punya Biswal + 2015-06-17 15:29:39 -0700 + Commit: d1069cb, github.com/apache/spark/pull/6844 + + [SPARK-8404] [STREAMING] [TESTS] Use thread-safe collections to make the tests more reliable + zsxwing + 2015-06-17 15:00:03 -0700 + Commit: a06d9c8, github.com/apache/spark/pull/6852 + + [SPARK-8306] [SQL] AddJar command needs to set the new class loader to the HiveConf inside executionHive.state. 
+ Yin Huai + 2015-06-17 14:52:43 -0700 + Commit: 302556f, github.com/apache/spark/pull/6758 + + [SPARK-7067] [SQL] fix bug when use complex nested fields in ORDER BY + Wenchen Fan + 2015-06-17 14:46:00 -0700 + Commit: 7f05b1f, github.com/apache/spark/pull/5659 + + [SPARK-7913] [CORE] Increase the maximum capacity of PartitionedPairBuffe, PartitionedSerializedPairBuffer and AppendOnlyMap + zsxwing + 2015-06-17 14:03:15 -0700 + Commit: a411a40, github.com/apache/spark/pull/6456 + + [SPARK-8373] [PYSPARK] Add emptyRDD to pyspark and fix the issue when calling sum on an empty RDD + zsxwing + 2015-06-17 13:59:39 -0700 + Commit: 0fc4b96, github.com/apache/spark/pull/6826 + + [SPARK-8372] History server shows incorrect information for application not started + Carson Wang + 2015-06-17 13:41:36 -0700 + Commit: 2837e06, github.com/apache/spark/pull/6827 + + [SPARK-8161] Set externalBlockStoreInitialized to be true, after ExternalBlockStore is initialized + Mingfei + 2015-06-17 13:40:07 -0700 + Commit: 7ad8c5d, github.com/apache/spark/pull/6702 + + [SPARK-8010] [SQL] Promote types to StringType as implicit conversion in non-binary expression of HiveTypeCoercion + OopsOutOfMemory + 2015-06-17 13:37:59 -0700 + Commit: 98ee351, github.com/apache/spark/pull/6551 + + [SPARK-6782] add sbt-revolver plugin + Imran Rashid + 2015-06-17 13:34:26 -0700 + Commit: a465944, github.com/apache/spark/pull/5426 + + [SPARK-8395] [DOCS] start-slave.sh docs incorrect + Sean Owen + 2015-06-17 13:31:10 -0700 + Commit: f005be0, github.com/apache/spark/pull/6855 + + [SPARK-8077] [SQL] Optimization for TreeNodes with large numbers of children + Michael Davies + 2015-06-17 12:56:55 -0700 + Commit: 0c1b2df, github.com/apache/spark/pull/6673 + + [SPARK-7017] [BUILD] [PROJECT INFRA] Refactor dev/run-tests into Python + Brennon York + 2015-06-17 12:00:34 -0700 + Commit: 50a0496, github.com/apache/spark/pull/5694 + + [SPARK-6390] [SQL] [MLlib] Port MatrixUDT to PySpark + MechCoder + 2015-06-17 11:10:16 -0700 + Commit: 6765ef9, github.com/apache/spark/pull/6354 + + [SPARK-7199] [SQL] Add date and timestamp support to UnsafeRow + Liang-Chi Hsieh + 2015-06-17 09:00:37 -0700 + Commit: 104f30c, github.com/apache/spark/pull/5984 + + [SPARK-8309] [CORE] Support for more than 12M items in OpenHashMap + Vyacheslav Baranov + 2015-06-17 09:42:29 +0100 + Commit: c13da20, github.com/apache/spark/pull/6763 + + Closes #6850. + Reynold Xin + 2015-06-17 00:28:40 -0700 + Commit: e3de14d + + [SPARK-8220][SQL]Add positive identify function + dragonli , zhichao.li + 2015-06-16 23:44:10 -0700 + Commit: bedff7d, github.com/apache/spark/pull/6838 + + [SPARK-8156] [SQL] create table to specific database by 'use dbname' + baishuo + 2015-06-16 16:40:02 -0700 + Commit: 0b8c8fd, github.com/apache/spark/pull/6695 + + [SPARK-7916] [MLLIB] MLlib Python doc parity check for classification and regression + Yanbo Liang + 2015-06-16 14:30:30 -0700 + Commit: ca99875, github.com/apache/spark/pull/6460 + + [SPARK-8126] [BUILD] Make sure temp dir exists when running tests. 
+ Marcelo Vanzin + 2015-06-16 21:10:18 +0100 + Commit: cebf241, github.com/apache/spark/pull/6805 + + [SQL] [DOC] improved a comment + Radek Ostrowski , radek + 2015-06-16 21:04:26 +0100 + Commit: 4bd10fd, github.com/apache/spark/pull/6332 + + [SPARK-DOCS] [SPARK-SQL] Update sql-programming-guide.md + Moussa Taifi + 2015-06-16 20:59:22 +0100 + Commit: dc455b8, github.com/apache/spark/pull/6847 + + [SPARK-8387] [WEBUI] Only show 4096 bytes content for executor log instead of show all + hushan[胡珊] + 2015-06-16 20:48:33 +0100 + Commit: 29c5025, github.com/apache/spark/pull/6834 + + [SPARK-8129] [CORE] [Sec] Pass auth secrets to executors via env variables + Kan Zhang + 2015-06-16 08:18:26 +0200 + Commit: 658814c, github.com/apache/spark/pull/6774 + + [SPARK-8367] [STREAMING] Add a limit for 'spark.streaming.blockInterval` since a data loss bug. + huangzhaowei , huangzhaowei + 2015-06-16 08:16:09 +0200 + Commit: ccf010f, github.com/apache/spark/pull/6818 + + [SPARK-7184] [SQL] enable codegen by default + Davies Liu + 2015-06-15 23:03:14 -0700 + Commit: bc76a0f, github.com/apache/spark/pull/6726 + + SPARK-8336 Fix NullPointerException with functions.rand() + tedyu + 2015-06-15 17:00:38 -0700 + Commit: 1a62d61, github.com/apache/spark/pull/6793 + + [SPARK-6583] [SQL] Support aggregate functions in ORDER BY + Yadong Qi , Michael Armbrust + 2015-06-15 12:01:52 -0700 + Commit: 6ae21a9, github.com/apache/spark/pull/on + + [SPARK-8350] [R] Log R unit test output to "unit-tests.log" + andrewor14 , Andrew Or + 2015-06-15 08:16:22 -0700 + Commit: 56d4e8a, github.com/apache/spark/pull/6807 + + [SPARK-8316] Upgrade to Maven 3.3.3 + Nicholas Chammas + 2015-06-15 08:18:01 +0100 + Commit: 4c5889e, github.com/apache/spark/pull/6770 + + [SPARK-8065] [SQL] Add support for Hive 0.14 metastores + Marcelo Vanzin + 2015-06-14 11:49:16 -0700 + Commit: 4eb48ed, github.com/apache/spark/pull/6627 + + fix read/write mixup + Peter Hoffmann + 2015-06-14 11:41:16 -0700 + Commit: f3f2a43, github.com/apache/spark/pull/6815 + + [SPARK-8362] [SQL] Add unit tests for +, -, *, /, % + Reynold Xin + 2015-06-14 11:23:23 -0700 + Commit: 53c16b9, github.com/apache/spark/pull/6813 + + [SPARK-8358] [SQL] Wait for child resolution when resolving generators + Michael Armbrust + 2015-06-14 11:21:42 -0700 + Commit: 9073a42, github.com/apache/spark/pull/6811 + + [SPARK-8354] [SQL] Fix off-by-factor-of-8 error when allocating scratch space in UnsafeFixedWidthAggregationMap + Josh Rosen + 2015-06-14 09:34:35 -0700 + Commit: ea7fd2f, github.com/apache/spark/pull/6809 + + [SPARK-8342][SQL] Fix Decimal setOrNull + Liang-Chi Hsieh + 2015-06-13 22:42:28 -0700 + Commit: cb7ada1, github.com/apache/spark/pull/6797 + + [Spark-8343] [Streaming] [Docs] Improve Spark Streaming Guides. + Mike Dusenberry + 2015-06-13 21:22:46 -0700 + Commit: 35d1267, github.com/apache/spark/pull/6801 + + [SPARK-8349] [SQL] Use expression constructors (rather than apply) in FunctionRegistry + Reynold Xin + 2015-06-13 18:22:17 -0700 + Commit: 2d71ba4, github.com/apache/spark/pull/6806 + + [SPARK-8347][SQL] Add unit tests for abs. 
+ Reynold Xin + 2015-06-13 17:10:13 -0700 + Commit: a138953, github.com/apache/spark/pull/6803 + + [SPARK-8052] [SQL] Use java.math.BigDecimal for casting String to Decimal instead of using toDouble + Liang-Chi Hsieh + 2015-06-13 16:39:52 -0700 + Commit: ddec452, github.com/apache/spark/pull/6645 + + [SPARK-8319] [CORE] [SQL] Update logic related to key orderings in shuffle dependencies + Josh Rosen + 2015-06-13 16:14:24 -0700 + Commit: af31335, github.com/apache/spark/pull/6773 + + [SPARK-8346] [SQL] Use InternalRow instread of catalyst.InternalRow + Davies Liu + 2015-06-13 16:13:26 -0700 + Commit: ce1041c, github.com/apache/spark/pull/6802 + + [SPARK-7897] Improbe type for jdbc/"unsigned bigint" + Rene Treffer + 2015-06-13 11:58:22 -0700 + Commit: d986fb9, github.com/apache/spark/pull/6789 + + [SPARK-8329][SQL] Allow _ in DataSource options + Michael Armbrust + 2015-06-12 23:11:16 -0700 + Commit: 4aed66f, github.com/apache/spark/pull/6786 + + [SPARK-7186] [SQL] Decouple internal Row from external Row + Davies Liu + 2015-06-12 23:06:31 -0700 + Commit: d46f8e5, github.com/apache/spark/pull/6792 + + [SPARK-8314][MLlib] improvement in performance of MLUtils.appendBias + Roger Menezes + 2015-06-12 18:29:58 -0700 + Commit: 6e9c3ff, github.com/apache/spark/pull/6768 + + [SPARK-7284] [STREAMING] Updated streaming documentation + Tathagata Das + 2015-06-12 15:22:59 -0700 + Commit: e9471d3, github.com/apache/spark/pull/6781 + + [SPARK-8330] DAG visualization: trim whitespace from input + Andrew Or + 2015-06-12 11:14:55 -0700 + Commit: 8860405, github.com/apache/spark/pull/6787 + + [SPARK-7993] [SQL] Improved DataFrame.show() output + akhilthatipamula <130050068@iitb.ac.in>, zsxwing + 2015-06-12 10:40:28 -0700 + Commit: 19834fa, github.com/apache/spark/pull/6633 + + [SPARK-8322] [EC2] Added spark 1.4.0 into the VALID_SPARK_VERSIONS and… + Mark Smith + 2015-06-12 08:19:03 -0700 + Commit: 71cc17b, github.com/apache/spark/pull/6776 + + [SQL] [MINOR] correct semanticEquals logic + Wenchen Fan + 2015-06-12 16:38:28 +0800 + Commit: c19c785, github.com/apache/spark/pull/6261 + + [SPARK-6566] [SQL] Related changes for newer parquet version + Yash Datta + 2015-06-12 13:44:09 +0800 + Commit: e428b3a, github.com/apache/spark/pull/5889 + + [SPARK-7862] [SQL] Fix the deadlock in script transformation for stderr + zhichao.li + 2015-06-11 22:28:28 -0700 + Commit: 2dd7f93, github.com/apache/spark/pull/6404 + + [SPARK-8317] [SQL] Do not push sort into shuffle in Exchange operator + Josh Rosen + 2015-06-11 22:15:15 -0700 + Commit: b9d177c, github.com/apache/spark/pull/6772 + + [SPARK-7158] [SQL] Fix bug of cached data cannot be used in collect() after cache() + Cheng Hao + 2015-06-11 18:01:32 -0700 + Commit: 767cc94, github.com/apache/spark/pull/5714 + + [SQL] Miscellaneous SQL/DF expression changes. + Reynold Xin + 2015-06-11 17:06:21 -0700 + Commit: 337c16d, github.com/apache/spark/pull/6754 + + [SPARK-7824] [SQL] Collapse operator reordering and constant folding into a single batch. + Zhongshuai Pei <799203320@qq.com>, DoingDone9 <799203320@qq.com> + 2015-06-11 17:01:02 -0700 + Commit: 7914c72, github.com/apache/spark/pull/6351 + + [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package. 
+ Reynold Xin + 2015-06-11 16:07:15 -0700 + Commit: 7d669a5, github.com/apache/spark/pull/6738 + + [SPARK-6511] [docs] Fix example command in hadoop-provided docs. + Marcelo Vanzin + 2015-06-11 15:29:03 -0700 + Commit: 9cbdf31, github.com/apache/spark/pull/6766 + + [SPARK-7444] [TESTS] Eliminate noisy css warn/error logs for UISeleniumSuite + zsxwing + 2015-06-11 14:21:49 -0700 + Commit: 95690a1, github.com/apache/spark/pull/5983 + + [SPARK-7915] [SQL] Support specifying the column list for target table in CTAS + Cheng Hao + 2015-06-11 14:03:08 -0700 + Commit: 040f223, github.com/apache/spark/pull/6458 + + [SPARK-8310] [EC2] Updates the master branch EC2 versions + Shivaram Venkataraman + 2015-06-11 13:18:42 -0700 + Commit: c8d551d, github.com/apache/spark/pull/6764 + + [SPARK-8305] [SPARK-8190] [SQL] improve codegen + Davies Liu + 2015-06-11 12:57:33 -0700 + Commit: 1191c3e, github.com/apache/spark/pull/6755 + + [SPARK-6411] [SQL] [PySpark] support date/datetime with timezone in Python + Davies Liu + 2015-06-11 01:00:41 -0700 + Commit: 424b007, github.com/apache/spark/pull/6250 + + [SPARK-8289] Specify stack size for consistency with Java tests - resolves test failures + Adam Roberts , a-roberts + 2015-06-11 08:40:46 +0100 + Commit: 6b68366, github.com/apache/spark/pull/6727 + + [HOTFIX] Fixing errors in name mappings + Patrick Wendell + 2015-06-10 22:56:36 -0700 + Commit: e84545f + + [HOTFIX] Adding more contributor name bindings + Patrick Wendell + 2015-06-10 21:13:47 -0700 + Commit: a777eb0 + + [SPARK-8217] [SQL] math function log2 + Daoyuan Wang + 2015-06-10 20:22:32 -0700 + Commit: 2758ff0, github.com/apache/spark/pull/6718 + + [SPARK-8248][SQL] string function: length + Cheng Hao + 2015-06-10 19:55:10 -0700 + Commit: 9fe3adc, github.com/apache/spark/pull/6724 + + [SPARK-8164] transformExpressions should support nested expression sequence + Wenchen Fan + 2015-06-10 18:22:47 -0700 + Commit: 4e42842, github.com/apache/spark/pull/6706 + + [SPARK-8285] [SQL] CombineSum should be calculated as unlimited decimal first + navis.ryu + 2015-06-10 18:19:12 -0700 + Commit: 6a47114, github.com/apache/spark/pull/6736 + + [SPARK-8189] [SQL] use Long for TimestampType in SQL + Davies Liu + 2015-06-10 16:55:39 -0700 + Commit: 37719e0, github.com/apache/spark/pull/6733 + + [SPARK-8200] [MLLIB] Check for empty RDDs in StreamingLinearAlgorithm + Paavo + 2015-06-10 23:17:42 +0100 + Commit: b928f54, github.com/apache/spark/pull/6713 + + [SPARK-2774] Set preferred locations for reduce tasks + Shivaram Venkataraman + 2015-06-10 15:03:40 -0700 + Commit: 96a7c88, github.com/apache/spark/pull/6652 + + [SPARK-8273] Driver hangs up when yarn shutdown in client mode + WangTaoTheTonic + 2015-06-10 13:34:19 -0700 + Commit: 5014d0e, github.com/apache/spark/pull/6717 + + [SPARK-8290] spark class command builder need read SPARK_JAVA_OPTS and SPARK_DRIVER_MEMORY properly + WangTaoTheTonic , Tao Wang + 2015-06-10 13:30:16 -0700 + Commit: cb871c4, github.com/apache/spark/pull/6741 + + [SPARK-7261] [CORE] Change default log level to WARN in the REPL + zsxwing + 2015-06-10 13:25:59 -0700 + Commit: 80043e9, github.com/apache/spark/pull/6734 + + [SPARK-7527] [CORE] Fix createNullValue to return the correct null values and REPL mode detection + zsxwing + 2015-06-10 13:22:52 
-0700 + Commit: e90c9d9, github.com/apache/spark/pull/6735 + + [SPARK-7756] CORE RDDOperationScope fix for IBM Java + Adam Roberts , a-roberts + 2015-06-10 13:21:01 -0700 + Commit: 19e30b4, github.com/apache/spark/pull/6740 + + [SPARK-8282] [SPARKR] Make number of threads used in RBackend configurable + Hossein + 2015-06-10 13:18:48 -0700 + Commit: 30ebf1a, github.com/apache/spark/pull/6730 + + [SPARK-5479] [YARN] Handle --py-files correctly in YARN. + Marcelo Vanzin + 2015-06-10 13:17:29 -0700 + Commit: 3811290, github.com/apache/spark/pull/6360 + + [SQL] [MINOR] Fixes a minor Java example error in SQL programming guide + Cheng Lian + 2015-06-10 11:48:14 -0700 + Commit: 8f7308f, github.com/apache/spark/pull/6749 + + [SPARK-7996] Deprecate the developer api SparkEnv.actorSystem + Ilya Ganelin + 2015-06-10 11:21:12 -0700 + Commit: 2b550a5, github.com/apache/spark/pull/6731 + + [SPARK-8215] [SPARK-8212] [SQL] add leaf math expression for e and pi + Daoyuan Wang + 2015-06-10 09:45:45 -0700 + Commit: c6ba7cc, github.com/apache/spark/pull/6716 + + [SPARK-7886] Added unit test for HAVING aggregate pushdown. + Reynold Xin + 2015-06-10 18:58:01 +0800 + Commit: e90035e, github.com/apache/spark/pull/6739 + + [SPARK-7886] Use FunctionRegistry for built-in expressions in HiveContext. + Reynold Xin + 2015-06-10 00:36:16 -0700 + Commit: 57c60c5, github.com/apache/spark/pull/6712 + + [SPARK-7792] [SQL] HiveContext registerTempTable not thread safe + navis.ryu + 2015-06-09 19:33:00 -0700 + Commit: 778f3ca, github.com/apache/spark/pull/6699 + + [SPARK-6511] [DOCUMENTATION] Explain how to use Hadoop provided builds + Patrick Wendell + 2015-06-09 16:14:21 -0700 + Commit: 6e4fb0c, github.com/apache/spark/pull/6729 + + [MINOR] [UI] DAG visualization: trim whitespace from input + Andrew Or + 2015-06-09 15:44:02 -0700 + Commit: 0d5892d, github.com/apache/spark/pull/6732 + + [SPARK-8274] [DOCUMENTATION-MLLIB] Fix wrong URLs in MLlib Frequent Pattern Mining Documentation + FavioVazquez + 2015-06-09 15:02:18 +0100 + Commit: 490d5a7, github.com/apache/spark/pull/6722 + + [SPARK-8140] [MLLIB] Remove construct to get weights in StreamingLinearAlgorithm + MechCoder + 2015-06-09 15:00:35 +0100 + Commit: 6c1723a, github.com/apache/spark/pull/6720 + + [STREAMING] [DOC] Remove duplicated description about WAL + Kousuke Saruta + 2015-06-09 12:19:01 +0100 + Commit: e6fb6ce, github.com/apache/spark/pull/6719 + + [SPARK-7886] Add built-in expressions to FunctionRegistry. + Reynold Xin , Santiago M. 
Mola + 2015-06-09 16:24:38 +0800 + Commit: 1b49999, github.com/apache/spark/pull/6710 + + [SPARK-8101] [CORE] Upgrade netty to avoid memory leak accord to netty #3837 issues + Sean Owen + 2015-06-09 08:00:04 +0100 + Commit: 0902a11, github.com/apache/spark/pull/6701 + + [SPARK-7990][SQL] Add methods to facilitate equi-join on multiple joining keys + Liang-Chi Hsieh + 2015-06-08 23:27:05 -0700 + Commit: 7658eb2, github.com/apache/spark/pull/6616 + + [SPARK-6820] [SPARKR] Convert NAs to null type in SparkR DataFrames + hqzizania + 2015-06-08 21:40:12 -0700 + Commit: a5c52c1, github.com/apache/spark/pull/6190 + + [SPARK-8168] [MLLIB] Add Python friendly constructor to PipelineModel + Xiangrui Meng + 2015-06-08 21:33:47 -0700 + Commit: 82870d5, github.com/apache/spark/pull/6709 + + [SPARK-8162] [HOTFIX] Fix NPE in spark-shell + Andrew Or + 2015-06-08 18:09:21 -0700 + Commit: f3eec92, github.com/apache/spark/pull/6711 + + [SPARK-8148] Do not use FloatType in partition column inference. + Reynold Xin + 2015-06-08 13:15:44 -0700 + Commit: 5185389, github.com/apache/spark/pull/6692 + + [SQL][minor] remove duplicated cases in `DecimalPrecision` + Wenchen Fan + 2015-06-08 11:52:02 -0700 + Commit: fe7669d, github.com/apache/spark/pull/6698 + + [SPARK-8121] [SQL] Fixes InsertIntoHadoopFsRelation job initialization for Hadoop 1.x + Cheng Lian + 2015-06-08 11:34:18 -0700 + Commit: bbdfc0a, github.com/apache/spark/pull/6669 + + [SPARK-8158] [SQL] several fix for HiveShim + Daoyuan Wang + 2015-06-08 11:06:27 -0700 + Commit: ed5c2dc, github.com/apache/spark/pull/6700 + + [MINOR] change new Exception to IllegalArgumentException + Daoyuan Wang + 2015-06-08 09:41:06 -0700 + Commit: 49f19b9, github.com/apache/spark/pull/6434 + + [SMALL FIX] Return null if catch EOFException + Mingfei + 2015-06-08 16:23:43 +0100 + Commit: 149d1b2, github.com/apache/spark/pull/6703 + + [SPARK-8140] [MLLIB] Remove empty model check in StreamingLinearAlgorithm + MechCoder + 2015-06-08 15:45:12 +0100 + Commit: e3e9c70, github.com/apache/spark/pull/6684 + + [SPARK-8126] [BUILD] Use custom temp directory during build. + Marcelo Vanzin + 2015-06-08 15:37:28 +0100 + Commit: a1d9e5c, github.com/apache/spark/pull/6674 + + [SPARK-7939] [SQL] Add conf to enable/disable partition column type inference + Liang-Chi Hsieh + 2015-06-08 17:50:38 +0800 + Commit: 03ef6be, github.com/apache/spark/pull/6503 + + [SPARK-7705] [YARN] Cleanup of .sparkStaging directory fails if application is killed + linweizhong + 2015-06-08 09:34:16 +0100 + Commit: eacd4a9, github.com/apache/spark/pull/6409 + + [SPARK-4761] [DOC] [SQL] kryo default setting in SQL Thrift server + Daoyuan Wang + 2015-06-08 01:07:50 -0700 + Commit: 10fc2f6, github.com/apache/spark/pull/6639 + + [SPARK-8154][SQL] Remove Term/Code type aliases in code generation. 
+ Reynold Xin + 2015-06-07 23:16:19 -0700 + Commit: 72ba0fc, github.com/apache/spark/pull/6694 + + [SPARK-8149][SQL] Break ExpressionEvaluationSuite down to multiple files + Reynold Xin + 2015-06-07 18:45:24 -0700 + Commit: f74be74, github.com/apache/spark/pull/6693 + + [SPARK-8117] [SQL] Push codegen implementation into each Expression + Davies Liu , Reynold Xin + 2015-06-07 14:11:20 -0700 + Commit: 5e7b6b6, github.com/apache/spark/pull/6690 + + [SPARK-2808] [STREAMING] [KAFKA] cleanup tests from + cody koeninger + 2015-06-07 21:42:45 +0100 + Commit: b127ff8, github.com/apache/spark/pull/5921 + + [SPARK-7733] [CORE] [BUILD] Update build, code to use Java 7 for 1.5.0+ + Sean Owen + 2015-06-07 20:18:13 +0100 + Commit: e84815d, github.com/apache/spark/pull/6265 + + [SPARK-7952][SQL] use internal Decimal instead of java.math.BigDecimal + Wenchen Fan + 2015-06-07 11:07:19 -0700 + Commit: db81b9d, github.com/apache/spark/pull/6574 + + [SPARK-8004][SQL] Quote identifier in JDBC data source. + Reynold Xin + 2015-06-07 10:52:02 -0700 + Commit: d6d601a, github.com/apache/spark/pull/6689 + + [DOC] [TYPO] Fix typo in standalone deploy scripts description + Yijie Shen + 2015-06-07 15:30:37 +0100 + Commit: 835f138, github.com/apache/spark/pull/6691 + + [SPARK-7042] [BUILD] use the standard akka artifacts with hadoop-2.x + Konstantin Shaposhnikov + 2015-06-07 13:41:00 +0100 + Commit: ca8dafc, github.com/apache/spark/pull/6492 + + [SPARK-8118] [SQL] Mutes noisy Parquet log output reappeared after upgrading Parquet to 1.7.0 + Cheng Lian + 2015-06-07 16:59:55 +0800 + Commit: 8c321d6, github.com/apache/spark/pull/6670 + + [SPARK-8146] DataFrame Python API: Alias replace in df.na + Reynold Xin + 2015-06-07 01:21:02 -0700 + Commit: 0ac4708, github.com/apache/spark/pull/6688 + + [SPARK-8141] [SQL] Precompute datatypes for partition columns and reuse it + Liang-Chi Hsieh + 2015-06-07 15:33:48 +0800 + Commit: 26d07f1, github.com/apache/spark/pull/6687 + + [SPARK-8145] [WEBUI] Trigger a double click on the span to show full job description. + 979969786 + 2015-06-06 23:15:27 -0700 + Commit: 081db94, github.com/apache/spark/pull/6646 + + [SPARK-8004][SQL] Enclose column names by JDBC Dialect + Liang-Chi Hsieh + 2015-06-06 22:59:31 -0700 + Commit: 901a552, github.com/apache/spark/pull/6577 + + [SPARK-7955] [CORE] Ensure executors with cached RDD blocks are not re… + Hari Shreedharan + 2015-06-06 21:13:26 -0700 + Commit: 3285a51, github.com/apache/spark/pull/6508 + + [SPARK-8136] [YARN] Fix flakiness in YarnClusterSuite. + Hari Shreedharan + 2015-06-06 21:09:56 -0700 + Commit: ed2cc3e, github.com/apache/spark/pull/6680 + + [SPARK-7169] [CORE] Allow metrics system to be configured through SparkConf. 
+ Marcelo Vanzin , Jacek Lewandowski + 2015-06-06 21:08:36 -0700 + Commit: 18c4fce, github.com/apache/spark/pull/6560 + + [SPARK-7639] [PYSPARK] [MLLIB] Python API for KernelDensity + MechCoder + 2015-06-06 14:52:14 -0700 + Commit: 5aa804f, github.com/apache/spark/pull/6387 + + [SPARK-8079] [SQL] Makes InsertIntoHadoopFsRelation job/task abortion more robust + Cheng Lian + 2015-06-06 17:23:12 +0800 + Commit: 16fc496, github.com/apache/spark/pull/6612 + + [SPARK-6973] remove skipped stage ID from completed set on the allJobsPage + Xu Tingjun , Xutingjun , meiyoula <1039320815@qq.com> + 2015-06-06 09:53:53 +0100 + Commit: a8077e5, github.com/apache/spark/pull/5550 + + [SPARK-8114][SQL] Remove some wildcard import on TestSQLContext._ round 3. + Reynold Xin + 2015-06-05 23:15:10 -0700 + Commit: a71be0a, github.com/apache/spark/pull/6677 + + [SPARK-6964] [SQL] Support Cancellation in the Thrift Server + Dong Wang + 2015-06-05 17:41:12 -0700 + Commit: eb19d3f, github.com/apache/spark/pull/6207 + + [SPARK-8114][SQL] Remove some wildcard import on TestSQLContext._ cont'd. + Reynold Xin + 2015-06-05 13:57:21 -0700 + Commit: 6ebe419, github.com/apache/spark/pull/6667 + + [SPARK-7991] [PySpark] Adding support for passing lists to describe. + amey + 2015-06-05 13:49:33 -0700 + Commit: 356a4a9, github.com/apache/spark/pull/6655 + + [SPARK-7747] [SQL] [DOCS] spark.sql.planner.externalSort + Luca Martinetti + 2015-06-05 13:40:11 -0700 + Commit: 4060526, github.com/apache/spark/pull/6272 + + [SPARK-8112] [STREAMING] Fix the negative event count issue + zsxwing + 2015-06-05 12:46:02 -0700 + Commit: 4f16d3f, github.com/apache/spark/pull/6659 + + [SPARK-7699] [CORE] Lazy start the scheduler for dynamic allocation + jerryshao + 2015-06-05 12:28:37 -0700 + Commit: 3f80bc8, github.com/apache/spark/pull/6430 + + [SPARK-8099] set executor cores into system in yarn-cluster mode + Xutingjun , xutingjun + 2015-06-05 11:41:39 -0700 + Commit: 0992a0a, github.com/apache/spark/pull/6643 + + Revert "[MINOR] [BUILD] Use custom temp directory during build." + Andrew Or + 2015-06-05 10:53:32 -0700 + Commit: 4036d05 + + [SPARK-8085] [SPARKR] Support user-specified schema in read.df + Shivaram Venkataraman + 2015-06-05 10:19:03 -0700 + Commit: 12f5eae, github.com/apache/spark/pull/6620 + + [SQL] Simplifies binary node pattern matching + Cheng Lian + 2015-06-05 23:06:19 +0800 + Commit: bc0d76a, github.com/apache/spark/pull/6537 + + [SPARK-6324] [CORE] Centralize handling of script usage messages. + Marcelo Vanzin + 2015-06-05 14:32:00 +0200 + Commit: 700312e, github.com/apache/spark/pull/5841 + + [STREAMING] Update streaming-kafka-integration.md + Akhil Das + 2015-06-05 14:23:23 +0200 + Commit: 019dc9f, github.com/apache/spark/pull/6666 + + [MINOR] [BUILD] Use custom temp directory during build. + Marcelo Vanzin + 2015-06-05 14:11:38 +0200 + Commit: b16b543, github.com/apache/spark/pull/6653 + + [MINOR] [BUILD] Change link to jenkins builds on github. 
+ Marcelo Vanzin + 2015-06-05 10:32:33 +0200 + Commit: da20c8c, github.com/apache/spark/pull/6664 + + [MINOR] remove unused interpolation var in log message + Sean Owen + 2015-06-05 00:32:46 -0700 + Commit: 3a5c4da, github.com/apache/spark/pull/6650 + + [DOC][Minor]Specify the common sources available for collecting + Yijie Shen + 2015-06-05 07:45:25 +0200 + Commit: 2777ed3, github.com/apache/spark/pull/6641 + + [SPARK-8116][PYSPARK] Allow sc.range() to take a single argument. + Ted Blackman + 2015-06-04 22:21:11 -0700 + Commit: e505460, github.com/apache/spark/pull/6656 + + [SPARK-8114][SQL] Remove some wildcard import on TestSQLContext._ + Reynold Xin + 2015-06-04 22:15:58 -0700 + Commit: 8f16b94, github.com/apache/spark/pull/6661 + + [SPARK-8106] [SQL] Set derby.system.durability=test to speed up Hive compatibility tests + Josh Rosen + 2015-06-04 17:33:24 -0700 + Commit: 74dc2a9, github.com/apache/spark/pull/6651 + + [SPARK-8098] [WEBUI] Show correct length of bytes on log page + Carson Wang + 2015-06-04 16:24:50 -0700 + Commit: 63bc0c4, github.com/apache/spark/pull/6640 + + [SPARK-7440][SQL] Remove physical Distinct operator in favor of Aggregate + Reynold Xin + 2015-06-04 13:52:53 -0700 + Commit: 2bcdf8c, github.com/apache/spark/pull/6637 + + Fixed style issues for [SPARK-6909][SQL] Remove Hive Shim code. + Reynold Xin + 2015-06-04 13:44:47 -0700 + Commit: 6593842 + + [SPARK-6909][SQL] Remove Hive Shim code + Cheolsoo Park + 2015-06-04 13:27:35 -0700 + Commit: 0526fea, github.com/apache/spark/pull/6604 + + [SPARK-8027] [SPARKR] Move man pages creation to install-dev.sh + Shivaram Venkataraman + 2015-06-04 12:52:16 -0700 + Commit: 3dc0052, github.com/apache/spark/pull/6593 + + [SPARK-7743] [SQL] Parquet 1.7 + Thomas Omans + 2015-06-04 11:32:03 -0700 + Commit: cd3176b, github.com/apache/spark/pull/6597 + + [SPARK-7969] [SQL] Added a DataFrame.drop function that accepts a Column reference. + Mike Dusenberry + 2015-06-04 11:30:07 -0700 + Commit: df7da07, github.com/apache/spark/pull/6585 + + [SPARK-7956] [SQL] Use Janino to compile SQL expressions into bytecode + Davies Liu + 2015-06-04 10:28:59 -0700 + Commit: c8709dc, github.com/apache/spark/pull/6479 + + Fix maxTaskFailures comment + Daniel Darabos + 2015-06-04 13:46:49 +0200 + Commit: 10ba188, github.com/apache/spark/pull/6621 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-06-03 23:45:06 -0700 + Commit: 9982d45, github.com/apache/spark/pull/5976 + + [BUILD] Fix Maven build for Kinesis + Andrew Or + 2015-06-03 20:45:31 -0700 + Commit: 984ad60 + + [BUILD] Use right branch when checking against Hive + Andrew Or + 2015-06-03 18:08:53 -0700 + Commit: 9cf740f, github.com/apache/spark/pull/6629 + + [BUILD] Increase Jenkins test timeout + Andrew Or + 2015-06-03 17:40:14 -0700 + Commit: e35cd36 + + [SPARK-8084] [SPARKR] Make SparkR scripts fail on error + Shivaram Venkataraman + 2015-06-03 17:02:16 -0700 + Commit: 0576c3c, github.com/apache/spark/pull/6623 + + [SPARK-8088] don't attempt to lower number of executors by 0 + Ryan Williams + 2015-06-03 16:54:46 -0700 + Commit: 51898b5, github.com/apache/spark/pull/6624 + + [HOTFIX] History Server API docs error fix. 
+ Hari Shreedharan + 2015-06-03 16:53:57 -0700 + Commit: 566cb59, github.com/apache/spark/pull/6628 + + [HOTFIX] [TYPO] Fix typo in #6546 + Andrew Or + 2015-06-03 16:04:02 -0700 + Commit: bfbdab1 + + [SPARK-6164] [ML] CrossValidatorModel should keep stats from fitting + leahmcguire + 2015-06-03 15:46:38 -0700 + Commit: d8662cd, github.com/apache/spark/pull/5915 + + [SPARK-8051] [MLLIB] make StringIndexerModel silent if input column does not exist + Xiangrui Meng + 2015-06-03 15:16:24 -0700 + Commit: 26c9d7a, github.com/apache/spark/pull/6595 + + [SPARK-3674] [EC2] Clear SPARK_WORKER_INSTANCES when using YARN + Shivaram Venkataraman + 2015-06-03 15:14:38 -0700 + Commit: d3e026f, github.com/apache/spark/pull/6424 + + [HOTFIX] Fix Hadoop-1 build caused by #5792. + Hari Shreedharan + 2015-06-03 15:11:02 -0700 + Commit: a8f1f15, github.com/apache/spark/pull/6619 + + [SPARK-7989] [CORE] [TESTS] Fix flaky tests in ExternalShuffleServiceSuite and SparkListenerWithClusterSuite + zsxwing + 2015-06-03 15:04:20 -0700 + Commit: f271347, github.com/apache/spark/pull/6546 + + [SPARK-8001] [CORE] Make AsynchronousListenerBus.waitUntilEmpty throw TimeoutException if timeout + zsxwing + 2015-06-03 15:03:07 -0700 + Commit: 1d8669f, github.com/apache/spark/pull/6550 + + [SPARK-8059] [YARN] Wake up allocation thread when new requests arrive. + Marcelo Vanzin + 2015-06-03 14:59:30 -0700 + Commit: aa40c44, github.com/apache/spark/pull/6600 + + [SPARK-8083] [MESOS] Use the correct base path in mesos driver page. + Timothy Chen + 2015-06-03 14:57:23 -0700 + Commit: bfbf12b, github.com/apache/spark/pull/6615 + + [MINOR] [UI] Improve confusing message on log page + Andrew Or + 2015-06-03 12:10:12 -0700 + Commit: c6a6dd0 + + [SPARK-8054] [MLLIB] Added several Java-friendly APIs + unit tests + Joseph K. Bradley + 2015-06-03 14:34:20 -0700 + Commit: 20a26b5, github.com/apache/spark/pull/6562 + + Update documentation for [SPARK-7980] [SQL] Support SQLContext.range(end) + Reynold Xin + 2015-06-03 14:19:10 -0700 + Commit: 2c5a06c + + [SPARK-8074] Parquet should throw AnalysisException during setup for data type/name related failures. + Reynold Xin + 2015-06-03 13:57:57 -0700 + Commit: 939e4f3, github.com/apache/spark/pull/6608 + + [SPARK-8063] [SPARKR] Spark master URL conflict between MASTER env variable and --master command line option. + Sun Rui + 2015-06-03 11:56:35 -0700 + Commit: 708c63b, github.com/apache/spark/pull/6605 + + [SPARK-7161] [HISTORY SERVER] Provide REST api to download event logs fro... + Hari Shreedharan + 2015-06-03 13:43:13 -0500 + Commit: d2a86eb, github.com/apache/spark/pull/5792 + + [SPARK-7980] [SQL] Support SQLContext.range(end) + animesh + 2015-06-03 11:28:18 -0700 + Commit: d053a31, github.com/apache/spark/pull/6609 + + [SPARK-7801] [BUILD] Updating versions to SPARK 1.5.0 + Patrick Wendell + 2015-06-03 10:11:27 -0700 + Commit: 2c4d550, github.com/apache/spark/pull/6328 + + [SPARK-7973] [SQL] Increase the timeout of two CliSuite tests. 
+ Yin Huai + 2015-06-03 09:26:21 -0700 + Commit: f1646e1, github.com/apache/spark/pull/6525 + + [SPARK-7983] [MLLIB] Add require for one-based indices in loadLibSVMFile + Yuhao Yang + 2015-06-03 13:15:57 +0200 + Commit: 28dbde3, github.com/apache/spark/pull/6538 + + [SPARK-7562][SPARK-6444][SQL] Improve error reporting for expression data type mismatch + Wenchen Fan + 2015-06-03 00:47:52 -0700 + Commit: d38cf21, github.com/apache/spark/pull/6405 + + [SPARK-8060] Improve DataFrame Python test coverage and documentation. + Reynold Xin + 2015-06-03 00:23:34 -0700 + Commit: ce320cb, github.com/apache/spark/pull/6601 + + [SPARK-8032] [PYSPARK] Make version checking for NumPy in MLlib more robust + MechCoder + 2015-06-02 23:24:47 -0700 + Commit: 452eb82, github.com/apache/spark/pull/6579 + + [SPARK-8043] [MLLIB] [DOC] update NaiveBayes and SVM examples in doc + Yuhao Yang + 2015-06-02 23:15:38 -0700 + Commit: 43adbd5, github.com/apache/spark/pull/6584 + + [MINOR] make the launcher project name consistent with others + WangTaoTheTonic + 2015-06-02 22:59:48 -0700 + Commit: ccaa823, github.com/apache/spark/pull/6603 + + [SPARK-8053] [MLLIB] renamed scalingVector to scalingVec + Joseph K. Bradley + 2015-06-02 22:56:56 -0700 + Commit: 07c16cb, github.com/apache/spark/pull/6596 + + [SPARK-7691] [SQL] Refactor CatalystTypeConverter to use type-specific row accessors + Josh Rosen + 2015-06-02 22:11:03 -0700 + Commit: cafd505, github.com/apache/spark/pull/6222 + + [SPARK-7547] [ML] Scala Example code for ElasticNet + DB Tsai + 2015-06-02 19:12:08 -0700 + Commit: a86b3e9, github.com/apache/spark/pull/6576 + + [SPARK-7387] [ML] [DOC] CrossValidator example code in Python + Ram Sriharsha + 2015-06-02 18:53:04 -0700 + Commit: c3f4c32, github.com/apache/spark/pull/6358 + + [SQL] [TEST] [MINOR] Follow-up of PR #6493, use Guava API to ensure Java 6 friendliness + Cheng Lian + 2015-06-02 17:07:13 -0700 + Commit: 5cd6a63, github.com/apache/spark/pull/6547 + + [SPARK-8049] [MLLIB] drop tmp col from OneVsRest output + Xiangrui Meng + 2015-06-02 16:51:17 -0700 + Commit: 89f21f6, github.com/apache/spark/pull/6592 + + [SPARK-8038] [SQL] [PYSPARK] fix Column.when() and otherwise() + Davies Liu + 2015-06-02 13:38:06 -0700 + Commit: 605ddbb, github.com/apache/spark/pull/6590 + + [SPARK-8014] [SQL] Avoid premature metadata discovery when writing a HadoopFsRelation with a save mode other than Append + Cheng Lian + 2015-06-02 13:32:13 -0700 + Commit: 686a45f, github.com/apache/spark/pull/6583 + + [SPARK-7985] [ML] [MLlib] [Docs] Remove "fittingParamMap" references. Updating ML Doc "Estimator, Transformer, and Param" examples. + Mike Dusenberry + 2015-06-02 12:38:14 -0700 + Commit: ad06727, github.com/apache/spark/pull/6514 + + [SPARK-8015] [FLUME] Remove Guava dependency from flume-sink. 
+ Marcelo Vanzin + 2015-06-02 11:20:33 -0700 + Commit: 0071bd8, github.com/apache/spark/pull/6555 + + [SPARK-8037] [SQL] Ignores files whose name starts with dot in HadoopFsRelation + Cheng Lian + 2015-06-03 00:59:50 +0800 + Commit: 1bb5d71, github.com/apache/spark/pull/6581 + + [SPARK-7432] [MLLIB] fix flaky CrossValidator doctest + Xiangrui Meng + 2015-06-02 08:51:00 -0700 + Commit: bd97840, github.com/apache/spark/pull/6572 + + [SPARK-8021] [SQL] [PYSPARK] make Python read/write API consistent with Scala + Davies Liu + 2015-06-02 08:37:18 -0700 + Commit: 445647a, github.com/apache/spark/pull/6578 + + [SPARK-8023][SQL] Add "deterministic" attribute to Expression to avoid collapsing nondeterministic projects. + Yin Huai , Reynold Xin + 2015-06-02 00:20:52 -0700 + Commit: 0f80990, github.com/apache/spark/pull/6573 + + [SPARK-8020] [SQL] Spark SQL conf in spark-defaults.conf make metadataHive get constructed too early + Yin Huai + 2015-06-02 00:16:56 -0700 + Commit: 7b7f7b6, github.com/apache/spark/pull/6571 + + [SPARK-6917] [SQL] DecimalType is not read back when non-native type exists + Davies Liu + 2015-06-01 23:12:29 -0700 + Commit: bcb47ad, github.com/apache/spark/pull/6558 + + [SPARK-7582] [MLLIB] user guide for StringIndexer + Xiangrui Meng + 2015-06-01 22:03:29 -0700 + Commit: 0221c7f, github.com/apache/spark/pull/6561 + + Fixed typo in the previous commit. + Reynold Xin + 2015-06-01 21:41:53 -0700 + Commit: b53a011 + + [SPARK-7965] [SPARK-7972] [SQL] Handle expressions containing multiple window expressions and make parser match window frames in case insensitive way + Yin Huai + 2015-06-01 21:40:17 -0700 + Commit: e797dba, github.com/apache/spark/pull/6524 + + [SPARK-8025][Streaming]Add JavaDoc style deprecation for deprecated Streaming methods + zsxwing + 2015-06-01 21:36:49 -0700 + Commit: 7f74bb3, github.com/apache/spark/pull/6564 + + Revert "[SPARK-8020] Spark SQL in spark-defaults.conf make metadataHive get constructed too early" + Reynold Xin + 2015-06-01 21:35:55 -0700 + Commit: 75dda33 + + [SPARK-8020] Spark SQL in spark-defaults.conf make metadataHive get constructed too early + Yin Huai + 2015-06-01 21:33:57 -0700 + Commit: 91f6be8, github.com/apache/spark/pull/6563 + + [minor doc] Add exploratory data analysis warning for DataFrame.stat.freqItem API + Reynold Xin + 2015-06-01 21:29:39 -0700 + Commit: 4c868b9, github.com/apache/spark/pull/6569 + + [SPARK-8027] [SPARKR] Add maven profile to build R package docs + Shivaram Venkataraman + 2015-06-01 21:21:45 -0700 + Commit: cae9306, github.com/apache/spark/pull/6567 + + [SPARK-8026][SQL] Add Column.alias to Scala/Java DataFrame API + Reynold Xin + 2015-06-01 21:13:15 -0700 + Commit: 89f642a, github.com/apache/spark/pull/6565 + + [SPARK-7982][SQL] DataFrame.stat.crosstab should use 0 instead of null for pairs that don't appear + Reynold Xin + 2015-06-01 21:11:19 -0700 + Commit: 6396cc0, github.com/apache/spark/pull/6566 + + [SPARK-8028] [SPARKR] Use addJar instead of setJars in SparkR + Shivaram Venkataraman + 2015-06-01 21:01:14 -0700 + Commit: 6b44278, github.com/apache/spark/pull/6568 + + [MINOR] [UI] Improve error message on log page + Andrew Or + 2015-06-01 19:39:03 -0700 + Commit: 15d7c90 + + [SPARK-7958] [STREAMING] Handled exception in StreamingContext.start() to prevent leaking of actors + 
Tathagata Das + 2015-06-01 20:04:57 -0700 + Commit: 2f9c751, github.com/apache/spark/pull/6559 + + [SPARK-7584] [MLLIB] User guide for VectorAssembler + Xiangrui Meng + 2015-06-01 15:05:14 -0700 + Commit: 90c6069, github.com/apache/spark/pull/6556 + + [SPARK-7497] [PYSPARK] [STREAMING] fix streaming flaky tests + Davies Liu + 2015-06-01 14:40:08 -0700 + Commit: b7ab029, github.com/apache/spark/pull/6239 + + [DOC] Minor modification to Streaming docs with regards to parallel data receiving + Nishkam Ravi , nishkamravi2 , nravi + 2015-06-01 21:34:41 +0100 + Commit: e7c7e51, github.com/apache/spark/pull/6544 + + Update README to include DataFrames and zinc. + Reynold Xin + 2015-05-31 23:55:45 -0700 + Commit: 3c01568, github.com/apache/spark/pull/6548 + + [SPARK-7952][SPARK-7984][SQL] equality check between boolean type and numeric type is broken. + Wenchen Fan + 2015-05-31 21:01:46 -0700 + Commit: a0e46a0, github.com/apache/spark/pull/6505 + + [SPARK-7978] [SQL] [PYSPARK] DecimalType should not be singleton + Davies Liu + 2015-05-31 19:55:57 -0700 + Commit: 91777a1, github.com/apache/spark/pull/6532 + + [SPARK-7986] Split scalastyle config into 3 sections. + Reynold Xin + 2015-05-31 18:04:57 -0700 + Commit: 6f006b5, github.com/apache/spark/pull/6543 + + [MINOR] Enable PySpark SQL readerwriter and window tests + Josh Rosen + 2015-05-31 15:17:05 -0700 + Commit: 9126ea4, github.com/apache/spark/pull/6542 + + [SPARK-7227] [SPARKR] Support fillna / dropna in R DataFrame. + Sun Rui + 2015-05-31 15:01:21 -0700 + Commit: 46576ab, github.com/apache/spark/pull/6183 + + [SPARK-3850] Turn style checker on for trailing whitespaces. + Reynold Xin + 2015-05-31 14:23:42 -0700 + Commit: 866652c, github.com/apache/spark/pull/6541 + + [SPARK-7949] [MLLIB] [DOC] update document with some missing save/load + Yuhao Yang + 2015-05-31 11:51:49 -0700 + Commit: 0674700, github.com/apache/spark/pull/6498 + + [SPARK-3850] Trim trailing spaces for MLlib. + Reynold Xin + 2015-05-31 11:35:30 -0700 + Commit: e1067d0, github.com/apache/spark/pull/6534 + + [MINOR] Add license for dagre-d3 and graphlib-dot + zsxwing + 2015-05-31 11:18:12 -0700 + Commit: d1d2def, github.com/apache/spark/pull/6539 + + [SPARK-7979] Enforce structural type checker. + Reynold Xin + 2015-05-31 01:37:56 -0700 + Commit: 4b5f12b, github.com/apache/spark/pull/6536 + + [SPARK-3850] Trim trailing spaces for SQL. + Reynold Xin + 2015-05-31 00:48:49 -0700 + Commit: 63a50be, github.com/apache/spark/pull/6535 + + [SPARK-3850] Trim trailing spaces for examples/streaming/yarn. + Reynold Xin + 2015-05-31 00:47:56 -0700 + Commit: 564bc11, github.com/apache/spark/pull/6530 + + [SPARK-3850] Trim trailing spaces for core. + Reynold Xin + 2015-05-31 00:16:22 -0700 + Commit: 74fdc97, github.com/apache/spark/pull/6533 + + [SPARK-7975] Add style checker to disallow overriding equals covariantly. + Reynold Xin + 2015-05-31 00:05:55 -0700 + Commit: 7896e99, github.com/apache/spark/pull/6527 + + [SQL] [MINOR] Adds @deprecated Scaladoc entry for SchemaRDD + Cheng Lian + 2015-05-30 23:49:42 -0700 + Commit: 8764dcc, github.com/apache/spark/pull/6529 + + [SPARK-7976] Add style checker to disallow overriding finalize. 
+ Reynold Xin + 2015-05-30 23:36:32 -0700 + Commit: 084fef7, github.com/apache/spark/pull/6528 + + [SQL] [MINOR] Fixes a minor comment mistake in IsolatedClientLoader + Cheng Lian + 2015-05-31 12:56:41 +0800 + Commit: f7fe9e4, github.com/apache/spark/pull/6521 + + Update documentation for the new DataFrame reader/writer interface. + Reynold Xin + 2015-05-30 20:10:02 -0700 + Commit: 00a7137, github.com/apache/spark/pull/6522 + + [SPARK-7971] Add JavaDoc style deprecation for deprecated DataFrame methods + Reynold Xin + 2015-05-30 19:51:53 -0700 + Commit: c63e1a7, github.com/apache/spark/pull/6523 + + [SQL] Tighten up visibility for JavaDoc. + Reynold Xin + 2015-05-30 19:50:52 -0700 + Commit: 14b314d, github.com/apache/spark/pull/6526 + + [SPARK-5610] [DOC] update genjavadocSettings to use the patched version of genjavadoc + Xiangrui Meng + 2015-05-30 17:21:41 -0700 + Commit: 2b258e1, github.com/apache/spark/pull/6506 + + [HOTFIX] Replace FunSuite with SparkFunSuite. + Josh Rosen + 2015-05-30 16:52:34 -0700 + Commit: 66a53a6 + + [SPARK-7920] [MLLIB] Make MLlib ChiSqSelector Serializable (& Fix Related Documentation Example). + Mike Dusenberry + 2015-05-30 16:50:59 -0700 + Commit: 1281a35, github.com/apache/spark/pull/6462 + + [SPARK-7918] [MLLIB] MLlib Python doc parity check for evaluation and feature + Yanbo Liang + 2015-05-30 16:24:07 -0700 + Commit: 1617363, github.com/apache/spark/pull/6461 + + [SPARK-7855] Move bypassMergeSort-handling from ExternalSorter to own component + Josh Rosen + 2015-05-30 15:27:51 -0700 + Commit: a643002, github.com/apache/spark/pull/6397 + + Updated SQL programming guide's Hive connectivity section. + Reynold Xin + 2015-05-30 14:57:23 -0700 + Commit: 7716a5a + + [SPARK-7849] [SQL] [Docs] Updates SQL programming guide for 1.4 + Cheng Lian + 2015-05-30 12:16:09 -0700 + Commit: 6e3f0c7, github.com/apache/spark/pull/6520 + + Closes #4685 + Reynold Xin + 2015-05-30 12:06:38 -0700 + Commit: d34b43b + + [DOCS] [MINOR] Update for the Hadoop versions table with hadoop-2.6 + Taka Shinagawa + 2015-05-30 08:25:21 -0400 + Commit: 3ab71eb, github.com/apache/spark/pull/6450 + + [SPARK-7717] [WEBUI] Only showing total memory and cores for alive workers + zhichao.li + 2015-05-30 08:06:11 -0400 + Commit: 2b35c99, github.com/apache/spark/pull/6317 + + [SPARK-7945] [CORE] Do trim to values in properties file + WangTaoTheTonic , Tao Wang + 2015-05-30 08:04:27 -0400 + Commit: 9d8aadb, github.com/apache/spark/pull/6496 + + [SPARK-7890] [DOCS] Document that Spark 2.11 now supports Kafka + Sean Owen + 2015-05-30 07:59:27 -0400 + Commit: 8c8de3e, github.com/apache/spark/pull/6470 + + [SPARK-7964][SQL] remove unnecessary type coercion rule + Wenchen Fan + 2015-05-30 00:26:46 -0700 + Commit: 0978aec, github.com/apache/spark/pull/6516 + + [SPARK-7459] [MLLIB] ElementwiseProduct Java example + Octavian Geagla + 2015-05-30 00:00:36 -0700 + Commit: e3a4374, github.com/apache/spark/pull/6008 + + [SPARK-7962] [MESOS] Fix master url parsing in rest submission client. 
+ Timothy Chen + 2015-05-29 23:56:18 -0700 + Commit: 78657d5, github.com/apache/spark/pull/6517 + + [SPARK-7576] [MLLIB] Add spark.ml user guide doc/example for ElementwiseProduct + Octavian Geagla + 2015-05-29 23:55:19 -0700 + Commit: da2112a, github.com/apache/spark/pull/6501 + + [TRIVIAL] Typo fix for last commit + Andrew Or + 2015-05-29 23:08:47 -0700 + Commit: 193dba0 + + [SPARK-7558] Guard against direct uses of FunSuite / FunSuiteLike + Andrew Or + 2015-05-29 22:57:46 -0700 + Commit: 609c492, github.com/apache/spark/pull/6510 + + [SPARK-7957] Preserve partitioning when using randomSplit + Burak Yavuz + 2015-05-29 22:19:15 -0700 + Commit: 7ed06c3, github.com/apache/spark/pull/6509 + + [DOCS][Tiny] Added a missing dash(-) in docs/configuration.md + Taka Shinagawa + 2015-05-29 20:35:14 -0700 + Commit: 3792d25, github.com/apache/spark/pull/6513 + + [HOT FIX] [BUILD] Fix maven build failures + Andrew Or + 2015-05-29 17:19:46 -0700 + Commit: a4f2412, github.com/apache/spark/pull/6511 + + [HOTFIX] [SQL] Maven test compilation issue + Andrew Or + 2015-05-29 15:26:49 -0700 + Commit: 8c99793 + + [SPARK-6013] [ML] Add more Python ML examples for spark.ml + Ram Sriharsha + 2015-05-29 15:22:26 -0700 + Commit: dbf8ff3, github.com/apache/spark/pull/6443 + + [SPARK-7954] [SPARKR] Create SparkContext in sparkRSQL init + Shivaram Venkataraman + 2015-05-29 15:08:30 -0700 + Commit: 5fb97dc, github.com/apache/spark/pull/6507 + + [SPARK-7910] [TINY] [JAVAAPI] expose partitioner information in javardd + Holden Karau + 2015-05-29 14:59:18 -0700 + Commit: 82a396c, github.com/apache/spark/pull/6464 + + [SPARK-7899] [PYSPARK] Fix Python 3 pyspark/sql/types module conflict + Michael Nazario + 2015-05-29 14:13:44 -0700 + Commit: 1c5b198, github.com/apache/spark/pull/6439 + + [SPARK-6806] [SPARKR] [DOCS] Add a new SparkR programming guide + Shivaram Venkataraman + 2015-05-29 14:11:58 -0700 + Commit: 5f48e5c, github.com/apache/spark/pull/6490 + + [SPARK-7558] Demarcate tests in unit-tests.log + Andrew Or + 2015-05-29 14:03:12 -0700 + Commit: 9eb222c, github.com/apache/spark/pull/6441 + + [SPARK-7940] Enforce whitespace checking for DO, TRY, CATCH, FINALLY, MATCH, LARROW, RARROW in style checker. 
+ Reynold Xin + 2015-05-29 13:38:37 -0700 + Commit: 94f62a4, github.com/apache/spark/pull/6491 + + [SPARK-7946] [MLLIB] DecayFactor wrongly set in StreamingKMeans + MechCoder + 2015-05-29 11:36:41 -0700 + Commit: 6181937, github.com/apache/spark/pull/6497 + + [SQL] [TEST] [MINOR] Uses a temporary log4j.properties in HiveThriftServer2Test to ensure expected logging behavior + Cheng Lian + 2015-05-29 11:11:40 -0700 + Commit: 4782e13, github.com/apache/spark/pull/6493 + + [SPARK-7950] [SQL] Sets spark.sql.hive.version in HiveThriftServer2.startWithContext() + Cheng Lian + 2015-05-29 10:43:34 -0700 + Commit: e7b6177, github.com/apache/spark/pull/6500 + + [SPARK-7524] [SPARK-7846] add configs for keytab and principal, pass these two configs with different way in different modes + WangTaoTheTonic + 2015-05-29 11:06:11 -0500 + Commit: a51b133, github.com/apache/spark/pull/6051 + + [SPARK-7863] [CORE] Create SimpleDateFormat for every SimpleDateParam instance because it's not thread-safe + zsxwing + 2015-05-29 05:17:41 -0400 + Commit: 8db40f6, github.com/apache/spark/pull/6406 + + [SPARK-7756] [CORE] Use testing cipher suites common to Oracle and IBM security providers + Tim Ellison + 2015-05-29 05:14:43 -0400 + Commit: bf46580, github.com/apache/spark/pull/6282 + + [SPARK-7912] [SPARK-7921] [MLLIB] Update OneHotEncoder to handle ML attributes and change includeFirst to dropLast + Xiangrui Meng + 2015-05-29 00:51:12 -0700 + Commit: 23452be, github.com/apache/spark/pull/6466 + + [SPARK-7929] Turn whitespace checker on for more token types. + Reynold Xin + 2015-05-28 23:00:02 -0700 + Commit: 97a60cf, github.com/apache/spark/pull/6487 + + [HOTFIX] Minor style fix from last commit + Patrick Wendell + 2015-05-28 22:48:02 -0700 + Commit: 36067ce + + [SPARK-7931] [STREAMING] Do not restart receiver when stopped + Tathagata Das + 2015-05-28 22:39:21 -0700 + Commit: e714ecf, github.com/apache/spark/pull/6483 + + [SPARK-7922] [MLLIB] use DataFrames for user/item factors in ALSModel + Xiangrui Meng + 2015-05-28 22:38:38 -0700 + Commit: db95137, github.com/apache/spark/pull/6468 + + [SPARK-7930] [CORE] [STREAMING] Fixed shutdown hook priorities + Tathagata Das + 2015-05-28 22:28:13 -0700 + Commit: cd3d9a5, github.com/apache/spark/pull/6482 + + [SPARK-7932] Fix misleading scheduler delay visualization + Kay Ousterhout + 2015-05-28 22:09:49 -0700 + Commit: 04ddcd4, github.com/apache/spark/pull/6484 + + [MINOR] fix RegressionEvaluator doc + Xiangrui Meng + 2015-05-28 21:26:43 -0700 + Commit: 834e699, github.com/apache/spark/pull/6469 + + [SPARK-7926] [PYSPARK] use the official Pyrolite release + Xiangrui Meng + 2015-05-28 21:20:54 -0700 + Commit: c45d58c, github.com/apache/spark/pull/6472 + + [SPARK-7927] whitespace fixes for GraphX. + Reynold Xin + 2015-05-28 20:17:16 -0700 + Commit: b069ad2, github.com/apache/spark/pull/6474 + + [SPARK-7927] whitespace fixes for core. + Reynold Xin + 2015-05-28 20:15:52 -0700 + Commit: 7f7505d, github.com/apache/spark/pull/6473 + + [SPARK-7927] whitespace fixes for Catalyst module. + Reynold Xin + 2015-05-28 20:11:57 -0700 + Commit: 8da560d, github.com/apache/spark/pull/6476 + + [SPARK-7929] Remove Bagel examples & whitespace fix for examples. 
+ Reynold Xin + 2015-05-28 20:11:04 -0700 + Commit: 2881d14, github.com/apache/spark/pull/6480 + + [SPARK-7927] whitespace fixes for SQL core. + Reynold Xin + 2015-05-28 20:10:21 -0700 + Commit: ff44c71, github.com/apache/spark/pull/6477 + + [SPARK-7927] [MLLIB] Enforce whitespace for more tokens in style checker + Xiangrui Meng + 2015-05-28 20:09:12 -0700 + Commit: 04616b1, github.com/apache/spark/pull/6481 + + [SPARK-7826] [CORE] Suppress extra calling getCacheLocs. + Takuya UESHIN + 2015-05-28 19:05:12 -0700 + Commit: 9b692bf, github.com/apache/spark/pull/6352 + + [SPARK-7933] Remove Patrick's username/pw from merge script + Kay Ousterhout + 2015-05-28 19:04:32 -0700 + Commit: 66c49ed, github.com/apache/spark/pull/6485 + + [SPARK-7927] whitespace fixes for Hive and ThriftServer. + Reynold Xin + 2015-05-28 18:08:56 -0700 + Commit: ee6a0e1, github.com/apache/spark/pull/6478 + + [SPARK-7927] whitespace fixes for streaming. + Reynold Xin + 2015-05-28 17:55:22 -0700 + Commit: 3af0b31, github.com/apache/spark/pull/6475 + + [SPARK-7577] [ML] [DOC] add bucketizer doc + Xusen Yin + 2015-05-28 17:30:12 -0700 + Commit: 1bd63e8, github.com/apache/spark/pull/6451 + + [SPARK-7853] [SQL] Fix HiveContext in Spark Shell + Yin Huai + 2015-05-28 17:12:30 -0700 + Commit: 572b62c, github.com/apache/spark/pull/6459 + + Remove SizeEstimator from o.a.spark package. + Reynold Xin + 2015-05-28 16:56:59 -0700 + Commit: 0077af2, github.com/apache/spark/pull/6471 + + [SPARK-7198] [MLLIB] VectorAssembler should output ML attributes + Xiangrui Meng + 2015-05-28 16:32:51 -0700 + Commit: 7859ab6, github.com/apache/spark/pull/6452 + + [DOCS] Fixing broken "IDE setup" link in the Building Spark documentation. + Mike Dusenberry + 2015-05-28 17:15:10 -0400 + Commit: 3e312a5, github.com/apache/spark/pull/6467 + + [MINOR] Fix a minor bug in the PageRank Example. + Li Yao + 2015-05-28 13:39:39 -0700 + Commit: c771589, github.com/apache/spark/pull/6455 + + [SPARK-7911] [MLLIB] A workaround for VectorUDT serialize (or deserialize) being called multiple times + Xiangrui Meng + 2015-05-28 12:03:46 -0700 + Commit: 530efe3, github.com/apache/spark/pull/6442 + + [SPARK-7895] [STREAMING] [EXAMPLES] Move Kafka examples from scala-2.10/src to src + zsxwing + 2015-05-28 09:04:12 -0700 + Commit: 000df2f, github.com/apache/spark/pull/6436 + + [SPARK-7782] fixed sort arrow issue + zuxqoj + 2015-05-27 23:13:13 -0700 + Commit: e838a25, github.com/apache/spark/pull/6437 + + [DOCS] Fix typo in documentation for Java UDF registration + Matt Wise + 2015-05-27 22:39:19 -0700 + Commit: 3541061, github.com/apache/spark/pull/6447 + + [SPARK-7896] Allow ChainedBuffer to store more than 2 GB + Sandy Ryza + 2015-05-27 22:23:22 -0700 + Commit: bd11b01, github.com/apache/spark/pull/6440 + + [SPARK-7873] Allow KryoSerializerInstance to create multiple streams at the same time + Josh Rosen + 2015-05-27 20:19:53 -0700 + Commit: 852f4de, github.com/apache/spark/pull/6415 + + [SPARK-7907] [SQL] [UI] Rename tab ThriftServer to SQL. 
+ Yin Huai + 2015-05-27 20:04:29 -0700 + Commit: 3c1f1ba, github.com/apache/spark/pull/6448 + + [SPARK-7897][SQL] Use DecimalType to represent unsigned bigint in JDBCRDD + Liang-Chi Hsieh + 2015-05-27 18:51:36 -0700 + Commit: a1e092e, github.com/apache/spark/pull/6438 + + [SPARK-7853] [SQL] Fixes a class loader issue in Spark SQL + Cheng Hao , Cheng Lian , Yin Huai + 2015-05-27 14:21:00 -0700 + Commit: db3fd05, github.com/apache/spark/pull/6435 + + [SPARK-7684] [SQL] Refactoring MetastoreDataSourcesSuite to workaround SPARK-7684 + Cheng Lian , Yin Huai + 2015-05-27 13:09:33 -0700 + Commit: b97ddff, github.com/apache/spark/pull/6353 + + [SPARK-7790] [SQL] date and decimal conversion for dynamic partition key + Daoyuan Wang + 2015-05-27 12:42:13 -0700 + Commit: 8161562, github.com/apache/spark/pull/6318 + + Removed Guava dependency from JavaTypeInference's type signature. + Reynold Xin + 2015-05-27 11:54:35 -0700 + Commit: 6fec1a9, github.com/apache/spark/pull/6431 + + [SPARK-7864] [UI] Fix the logic grabbing the link from table in AllJobPage + Kousuke Saruta + 2015-05-27 11:41:35 -0700 + Commit: 0db76c9, github.com/apache/spark/pull/6432 + + [SPARK-7847] [SQL] Fixes dynamic partition directory escaping + Cheng Lian + 2015-05-27 10:09:12 -0700 + Commit: 15459db, github.com/apache/spark/pull/6389 + + [SPARK-7878] Rename Stage.jobId to firstJobId + Kay Ousterhout + 2015-05-27 09:32:29 -0700 + Commit: ff0ddff, github.com/apache/spark/pull/6418 + + [CORE] [TEST] HistoryServerSuite failed due to timezone issue + scwf + 2015-05-27 09:12:18 -0500 + Commit: 4615081, github.com/apache/spark/pull/6425 + + [SQL] Rename MathematicalExpression UnaryMathExpression, and specify BinaryMathExpression's output data type as DoubleType. + Reynold Xin + 2015-05-27 01:13:57 -0700 + Commit: 3e7d7d6, github.com/apache/spark/pull/6428 + + [SPARK-7887][SQL] Remove EvaluatedType from SQL Expression. + Reynold Xin + 2015-05-27 01:12:59 -0700 + Commit: 9f48bf6, github.com/apache/spark/pull/6427 + + [SPARK-7697][SQL] Use LongType for unsigned int in JDBCRDD + Liang-Chi Hsieh + 2015-05-27 00:27:39 -0700 + Commit: 4f98d7a, github.com/apache/spark/pull/6229 + + [SPARK-7850][BUILD] Hive 0.12.0 profile in POM should be removed + Cheolsoo Park + 2015-05-27 00:18:42 -0700 + Commit: 6dd6458, github.com/apache/spark/pull/6393 + + [SPARK-7535] [.1] [MLLIB] minor changes to the pipeline API + Xiangrui Meng + 2015-05-26 23:51:32 -0700 + Commit: a9f1c0c, github.com/apache/spark/pull/6392 + + [SPARK-7868] [SQL] Ignores _temporary directories in HadoopFsRelation + Cheng Lian + 2015-05-26 20:48:56 -0700 + Commit: b463e6d, github.com/apache/spark/pull/6411 + + [SPARK-7858] [SQL] Use output schema, not relation schema, for data source input conversion + Josh Rosen , Cheng Lian , Cheng Lian + 2015-05-26 20:24:35 -0700 + Commit: 0c33c7b, github.com/apache/spark/pull/5986. + + [SPARK-7637] [SQL] O(N) merge implementation for StructType merge + rowan + 2015-05-26 18:17:16 -0700 + Commit: 0366834, github.com/apache/spark/pull/6259 + + [SPARK-7883] [DOCS] [MLLIB] Fixing broken trainImplicit Scala example in MLlib Collaborative Filtering documentation. 
+ Mike Dusenberry + 2015-05-26 18:08:57 -0700 + Commit: 0463428, github.com/apache/spark/pull/6422 + + [SPARK-7864] [UI] Do not kill innocent stages from visualization + Andrew Or + 2015-05-26 16:31:34 -0700 + Commit: 8f20824, github.com/apache/spark/pull/6419 + + [SPARK-7748] [MLLIB] Graduate spark.ml from alpha + Xiangrui Meng + 2015-05-26 15:51:31 -0700 + Commit: 836a758, github.com/apache/spark/pull/6417 + + [SPARK-6602] [CORE] Remove some places in core that calling SparkEnv.actorSystem + zsxwing + 2015-05-26 15:28:49 -0700 + Commit: 9f74224, github.com/apache/spark/pull/6333 + + [SPARK-3674] YARN support in Spark EC2 + Shivaram Venkataraman + 2015-05-26 15:01:27 -0700 + Commit: 2e9a5f2, github.com/apache/spark/pull/6376 + + [SPARK-7844] [MLLIB] Fix broken tests in KernelDensity + MechCoder + 2015-05-26 13:21:00 -0700 + Commit: 6166473, github.com/apache/spark/pull/6383 + + Revert "[SPARK-7042] [BUILD] use the standard akka artifacts with hadoop-2.x" + Patrick Wendell + 2015-05-26 10:05:13 -0700 + Commit: b7d8085 + + [SPARK-7854] [TEST] refine Kryo test suite + Zhang, Liye + 2015-05-26 17:08:16 +0100 + Commit: 6309912, github.com/apache/spark/pull/6395 + + [DOCS] [MLLIB] Fixing misformatted links in v1.4 MLlib Naive Bayes documentation by removing space and newline characters. + Mike Dusenberry + 2015-05-26 17:05:58 +0100 + Commit: e5a63a0, github.com/apache/spark/pull/6412 + + [SPARK-7806][EC2] Fixes that allow the spark_ec2.py tool to run with Python3 + meawoppl + 2015-05-26 09:02:25 -0700 + Commit: 8dbe777, github.com/apache/spark/pull/6336 + + [SPARK-7339] [PYSPARK] PySpark shuffle spill memory sometimes are not correct + linweizhong + 2015-05-26 08:35:39 -0700 + Commit: 8948ad3, github.com/apache/spark/pull/5887 + + [CORE] [TEST] Fix SimpleDateParamTest + scwf , Fei Wang + 2015-05-26 08:42:52 -0500 + Commit: bf49c22, github.com/apache/spark/pull/6377 + + [SPARK-7042] [BUILD] use the standard akka artifacts with hadoop-2.x + Konstantin Shaposhnikov + 2015-05-26 07:49:32 +0100 + Commit: 43aa819, github.com/apache/spark/pull/6341 + + [SQL][minor] Removed unused Catalyst logical plan DSL. + Reynold Xin + 2015-05-25 23:09:22 -0700 + Commit: c9adcad, github.com/apache/spark/pull/6350 + + [SPARK-7832] [Build] Always run SQL tests in master build. + Yin Huai + 2015-05-25 18:23:58 -0700 + Commit: f38e619, github.com/apache/spark/pull/6385 + + [SPARK-6391][DOCS] Document Tachyon compatibility. 
+ Calvin Jia + 2015-05-25 16:50:43 -0700 + Commit: ce0051d, github.com/apache/spark/pull/6382 + + [SPARK-7842] [SQL] Makes task committing/aborting in InsertIntoHadoopFsRelation more robust + Cheng Lian + 2015-05-26 00:28:47 +0800 + Commit: 8af1bf1, github.com/apache/spark/pull/6378 + + [SPARK-7684] [SQL] Invoking HiveContext.newTemporaryConfiguration() shouldn't create new metastore directory + Cheng Lian + 2015-05-26 00:16:06 +0800 + Commit: bfeedc6, github.com/apache/spark/pull/6359 + + Add test which shows Kryo buffer size configured in mb is properly supported + tedyu + 2015-05-25 08:20:31 +0100 + Commit: fd31fd4, github.com/apache/spark/pull/6390 + + Close HBaseAdmin at the end of HBaseTest + tedyu + 2015-05-25 08:19:42 +0100 + Commit: 23bea97, github.com/apache/spark/pull/6381 + + [SPARK-7811] Fix typo on slf4j configuration on metrics.properties.tem… + Judy Nash + 2015-05-24 21:48:27 +0100 + Commit: 4f4ba8f, github.com/apache/spark/pull/6362 + + [SPARK-7833] [ML] Add python wrapper for RegressionEvaluator + Ram Sriharsha + 2015-05-24 10:36:02 -0700 + Commit: 65c696e, github.com/apache/spark/pull/6365 + + [SPARK-7805] [SQL] Move SQLTestUtils.scala and ParquetTest.scala to src/test + Yin Huai + 2015-05-24 09:51:37 -0700 + Commit: ed21476, github.com/apache/spark/pull/6334 + + [SPARK-7845] [BUILD] Bump "Hadoop 1" tests to version 1.2.1 + Yin Huai + 2015-05-24 09:49:57 -0700 + Commit: bfbc0df, github.com/apache/spark/pull/6384 + + [SPARK-7287] [HOTFIX] Disable o.a.s.deploy.SparkSubmitSuite --packages + Patrick Wendell + 2015-05-23 19:44:03 -0700 + Commit: 3c1a2d0 + + [HOTFIX] Copy SparkR lib if it exists in make-distribution + Shivaram Venkataraman + 2015-05-23 12:28:16 -0700 + Commit: b231baa, github.com/apache/spark/pull/6379 + + [SPARK-7654] [SQL] Move insertInto into reader/writer interface. + Yin Huai , Reynold Xin + 2015-05-23 09:48:20 -0700 + Commit: 2b7e635, github.com/apache/spark/pull/6366 + + Fix install jira-python + Davies Liu + 2015-05-23 09:14:07 -0700 + Commit: a4df0f2, github.com/apache/spark/pull/6367 + + [SPARK-7840] add insertInto() to Writer + Davies Liu + 2015-05-23 09:07:14 -0700 + Commit: be47af1, github.com/apache/spark/pull/6375 + + [SPARK-7322, SPARK-7836, SPARK-7822][SQL] DataFrame window function related updates + Davies Liu , Reynold Xin + 2015-05-23 08:30:05 -0700 + Commit: efe3bfd, github.com/apache/spark/pull/6374 + + [SPARK-7777][Streaming] Handle the case when there is no block in a batch + zsxwing + 2015-05-23 02:11:17 -0700 + Commit: ad0badb, github.com/apache/spark/pull/6372 + + [SPARK-6811] Copy SparkR lib in make-distribution.sh + Shivaram Venkataraman + 2015-05-23 00:04:01 -0700 + Commit: a40bca0, github.com/apache/spark/pull/6373 + + [SPARK-6806] [SPARKR] [DOCS] Fill in SparkR examples in programming guide + Davies Liu + 2015-05-23 00:00:30 -0700 + Commit: 7af3818, github.com/apache/spark/pull/5442 + + [SPARK-5090] [EXAMPLES] The improvement of python converter for hbase + GenTang + 2015-05-22 23:37:03 -0700 + Commit: 4583cf4, github.com/apache/spark/pull/3920 + + [HOTFIX] Add tests for SparkListenerApplicationStart with Driver Logs. 
+ Hari Shreedharan + 2015-05-22 23:07:56 -0700 + Commit: 368b8c2, github.com/apache/spark/pull/6368 + + [SPARK-7838] [STREAMING] Set scope for kinesis stream + Tathagata Das + 2015-05-22 23:05:54 -0700 + Commit: baa8983, github.com/apache/spark/pull/6369 + + [MINOR] Add SparkR to create-release script + Shivaram Venkataraman + 2015-05-22 22:33:49 -0700 + Commit: 017b340, github.com/apache/spark/pull/6371 + + [SPARK-7795] [CORE] Speed up task scheduling in standalone mode by reusing serializer + Akshat Aranya + 2015-05-22 22:03:31 -0700 + Commit: a163574, github.com/apache/spark/pull/6323 + + [SPARK-7830] [DOCS] [MLLIB] Adding logistic regression to the list of Multiclass Classification Supported Methods documentation + Mike Dusenberry + 2015-05-22 18:03:12 -0700 + Commit: 63a5ce7, github.com/apache/spark/pull/6357 + + [SPARK-7224] [SPARK-7306] mock repository generator for --packages tests without nio.Path + Burak Yavuz + 2015-05-22 17:48:09 -0700 + Commit: 8014e1f, github.com/apache/spark/pull/5892 + + [SPARK-7788] Made KinesisReceiver.onStart() non-blocking + Tathagata Das + 2015-05-22 17:39:01 -0700 + Commit: 1c388a9, github.com/apache/spark/pull/6348 + + [SPARK-7771] [SPARK-7779] Dynamic allocation: lower default timeouts further + Andrew Or + 2015-05-22 17:37:38 -0700 + Commit: 3d8760d, github.com/apache/spark/pull/6301 + + [SPARK-7834] [SQL] Better window error messages + Michael Armbrust + 2015-05-22 17:23:12 -0700 + Commit: 3c13051, github.com/apache/spark/pull/6363 + + [SPARK-7760] add /json back into master & worker pages; add test + Imran Rashid + 2015-05-22 16:05:07 -0700 + Commit: 821254f, github.com/apache/spark/pull/6284 + + [SPARK-7270] [SQL] Consider dynamic partition when inserting into hive table + Liang-Chi Hsieh + 2015-05-22 15:39:58 -0700 + Commit: 126d723, github.com/apache/spark/pull/5864 + + [SPARK-7724] [SQL] Support Intersect/Except in Catalyst DSL. + Santiago M. Mola + 2015-05-22 15:10:27 -0700 + Commit: e4aef91, github.com/apache/spark/pull/6327 + + [SPARK-7758] [SQL] Override more configs to avoid failure when connect to a postgre sql + WangTaoTheTonic + 2015-05-22 14:43:16 -0700 + Commit: 31d5d46, github.com/apache/spark/pull/6314 + + [SPARK-7766] KryoSerializerInstance reuse is unsafe when auto-reset is disabled + Josh Rosen + 2015-05-22 13:28:14 -0700 + Commit: eac0069, github.com/apache/spark/pull/6293 + + [SPARK-7574] [ML] [DOC] User guide for OneVsRest + Ram Sriharsha + 2015-05-22 13:18:08 -0700 + Commit: 509d55a, github.com/apache/spark/pull/6296 + + Revert "[BUILD] Always run SQL tests in master build." 
+ Patrick Wendell + 2015-05-22 10:04:45 -0700 + Commit: c63036c + + [SPARK-7404] [ML] Add RegressionEvaluator to spark.ml + Ram Sriharsha + 2015-05-22 09:59:44 -0700 + Commit: f490b3b, github.com/apache/spark/pull/6344 + + [SPARK-6743] [SQL] Fix empty projections of cached data + Michael Armbrust + 2015-05-22 09:43:46 -0700 + Commit: 3b68cb0, github.com/apache/spark/pull/6165 + + [MINOR] [SQL] Ignores Thrift server UISeleniumSuite + Cheng Lian + 2015-05-22 16:25:52 +0800 + Commit: 4e5220c, github.com/apache/spark/pull/6345 + + [SPARK-7322][SQL] Window functions in DataFrame + Cheng Hao , Reynold Xin + 2015-05-22 01:00:16 -0700 + Commit: f6f2eeb, github.com/apache/spark/pull/6343 + + [SPARK-7578] [ML] [DOC] User guide for spark.ml Normalizer, IDF, StandardScaler + Joseph K. Bradley + 2015-05-21 22:59:45 -0700 + Commit: 2728c3d, github.com/apache/spark/pull/6127 + + [SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for 1.4 + Xiangrui Meng + 2015-05-21 22:57:33 -0700 + Commit: 8f11c61, github.com/apache/spark/pull/6322 + + [DOCS] [MLLIB] Fixing broken link in MLlib Linear Methods documentation. + Mike Dusenberry + 2015-05-21 19:05:04 -0700 + Commit: e4136ea, github.com/apache/spark/pull/6340 + + [SPARK-7657] [YARN] Add driver logs links in application UI, in cluster mode. + Hari Shreedharan + 2015-05-21 20:24:28 -0500 + Commit: 956c4c9, github.com/apache/spark/pull/6166 + + [SPARK-7219] [MLLIB] Output feature attributes in HashingTF + Xiangrui Meng + 2015-05-21 18:04:45 -0700 + Commit: 85b9637, github.com/apache/spark/pull/6308 + + [SPARK-7794] [MLLIB] update RegexTokenizer default settings + Xiangrui Meng + 2015-05-21 17:59:03 -0700 + Commit: f5db4b4, github.com/apache/spark/pull/6330 + + [SPARK-7783] [SQL] [PySpark] add DataFrame.rollup/cube in Python + Davies Liu + 2015-05-21 17:43:08 -0700 + Commit: 17791a5, github.com/apache/spark/pull/6311 + + [SPARK-7776] [STREAMING] Added shutdown hook to StreamingContext + Tathagata Das + 2015-05-21 17:41:31 -0700 + Commit: d68ea24, github.com/apache/spark/pull/6307 + + [SPARK-7737] [SQL] Use leaf dirs having data files to discover partitions. + Yin Huai + 2015-05-22 07:10:26 +0800 + Commit: 347b501, github.com/apache/spark/pull/6329 + + [BUILD] Always run SQL tests in master build. 
+ Yin Huai + 2015-05-21 15:40:58 -0700 + Commit: 147b6be, github.com/apache/spark/pull/5955 + + [SPARK-7800] isDefined should not marked too early in putNewKey + Liang-Chi Hsieh + 2015-05-21 23:12:00 +0100 + Commit: 5a3c04b, github.com/apache/spark/pull/6324 + + [SPARK-7718] [SQL] Speed up partitioning by avoiding closure cleaning + Andrew Or + 2015-05-21 14:33:11 -0700 + Commit: 5287eec, github.com/apache/spark/pull/6256 + + [SPARK-7711] Add a startTime property to match the corresponding one in Scala + Holden Karau + 2015-05-21 14:08:57 -0700 + Commit: 6b18cdc, github.com/apache/spark/pull/6275 + + [SPARK-7478] [SQL] Added SQLContext.getOrCreate + Tathagata Das + 2015-05-21 14:08:20 -0700 + Commit: 3d0cccc, github.com/apache/spark/pull/6006 + + [SPARK-7763] [SPARK-7616] [SQL] Persists partition columns into metastore + Yin Huai , Cheng Lian + 2015-05-21 13:51:40 -0700 + Commit: 30f3f55, github.com/apache/spark/pull/6285 + + [SPARK-7722] [STREAMING] Added Kinesis to style checker + Tathagata Das + 2015-05-21 13:50:08 -0700 + Commit: 311fab6, github.com/apache/spark/pull/6325 + + [SPARK-7498] [MLLIB] add varargs back to setDefault + Xiangrui Meng + 2015-05-21 13:06:53 -0700 + Commit: cdc7c05, github.com/apache/spark/pull/6320 + + [SPARK-7585] [ML] [DOC] VectorIndexer user guide section + Joseph K. Bradley + 2015-05-21 13:05:48 -0700 + Commit: 6d75ed7, github.com/apache/spark/pull/6255 + + [SPARK-7775] YARN AM negative sleep exception + Andrew Or + 2015-05-21 20:34:20 +0100 + Commit: 15680ae, github.com/apache/spark/pull/6305 + + [SQL] [TEST] udf_java_method failed due to jdk version + scwf + 2015-05-21 12:31:58 -0700 + Commit: f6c486a, github.com/apache/spark/pull/6274 + + [SPARK-7793] [MLLIB] Use getOrElse for getting the threshold of SVM model + Shuo Xiang + 2015-05-21 12:09:44 -0700 + Commit: 4f57200, github.com/apache/spark/pull/6321 + + [SPARK-7394][SQL] Add Pandas style cast (astype) + kaka1992 + 2015-05-21 11:50:39 -0700 + Commit: 699906e, github.com/apache/spark/pull/6313 + + [SPARK-6416] [DOCS] RDD.fold() requires the operator to be commutative + Sean Owen + 2015-05-21 19:42:51 +0100 + Commit: 6e53402, github.com/apache/spark/pull/6231 + + [SPARK-7787] [STREAMING] Fix serialization issue of SerializableAWSCredentials + Tathagata Das + 2015-05-21 11:39:32 -0700 + Commit: 4b7ff30, github.com/apache/spark/pull/6316 + + [SPARK-7749] [SQL] Fixes partition discovery for non-partitioned tables + Cheng Lian , Yin Huai + 2015-05-21 10:56:17 -0700 + Commit: 8730fbb, github.com/apache/spark/pull/6287 + + [SPARK-7752] [MLLIB] Use lowercase letters for NaiveBayes.modelType + Xiangrui Meng + 2015-05-21 10:30:08 -0700 + Commit: 13348e2, github.com/apache/spark/pull/6277 + + [SPARK-7565] [SQL] fix MapType in JsonRDD + Davies Liu + 2015-05-21 09:58:47 -0700 + Commit: a25c1ab, github.com/apache/spark/pull/6084 + + [SPARK-7320] [SQL] [Minor] Move the testData into beforeAll() + Cheng Hao + 2015-05-21 09:28:00 -0700 + Commit: feb3a9d, github.com/apache/spark/pull/6312 + + [SPARK-7745] Change asserts to requires for user input checks in Spark Streaming + Burak Yavuz + 2015-05-21 00:30:55 -0700 + Commit: 1ee8eb4, github.com/apache/spark/pull/6271 + + [SPARK-7753] [MLLIB] Update KernelDensity API + Xiangrui Meng + 2015-05-20 
23:38:58 -0700 + Commit: 947ea1c, github.com/apache/spark/pull/6279 + + [SPARK-7606] [SQL] [PySpark] add version to Python SQL API docs + Davies Liu + 2015-05-20 23:05:54 -0700 + Commit: 8ddcb25, github.com/apache/spark/pull/6295 + + [SPARK-7389] [CORE] Tachyon integration improvement + Mingfei + 2015-05-20 22:33:03 -0700 + Commit: 04940c4, github.com/apache/spark/pull/5908 + + [SPARK-7746][SQL] Add FetchSize parameter for JDBC driver + Liang-Chi Hsieh + 2015-05-20 22:23:49 -0700 + Commit: d0eb9ff, github.com/apache/spark/pull/6283 + + [SPARK-7774] [MLLIB] add sqlContext to MLlibTestSparkContext + Xiangrui Meng + 2015-05-20 20:30:39 -0700 + Commit: ddec173, github.com/apache/spark/pull/6303 + + [SPARK-7320] [SQL] Add Cube / Rollup for dataframe + Cheng Hao + 2015-05-20 19:58:22 -0700 + Commit: 42c592a, github.com/apache/spark/pull/6304 + + [SPARK-7777] [STREAMING] Fix the flaky test in org.apache.spark.streaming.BasicOperationsSuite + zsxwing + 2015-05-20 19:56:01 -0700 + Commit: 895baf8, github.com/apache/spark/pull/6306 + + [SPARK-7750] [WEBUI] Rename endpoints from `json` to `api` to allow fu… + Hari Shreedharan + 2015-05-20 21:13:10 -0500 + Commit: a70bf06, github.com/apache/spark/pull/6273 + + [SPARK-7719] Re-add UnsafeShuffleWriterSuite test that was removed for Java 6 compat + Josh Rosen + 2015-05-20 17:52:50 -0700 + Commit: 5196eff, github.com/apache/spark/pull/6298 + + [SPARK-7762] [MLLIB] set default value for outputCol + Xiangrui Meng + 2015-05-20 17:26:26 -0700 + Commit: c330e52, github.com/apache/spark/pull/6289 + + [SPARK-7251] Perform sequential scan when iterating over BytesToBytesMap + Josh Rosen + 2015-05-20 16:42:49 -0700 + Commit: f2faa7a, github.com/apache/spark/pull/6159 + + [SPARK-7698] Cache and reuse buffers in ExecutorMemoryAllocator when using heap allocation + Josh Rosen + 2015-05-20 16:37:11 -0700 + Commit: 7956dd7, github.com/apache/spark/pull/6227 + + [SPARK-7767] [STREAMING] Added test for checkpoint serialization in StreamingContext.start() + Tathagata Das + 2015-05-20 16:21:23 -0700 + Commit: 3c434cb, github.com/apache/spark/pull/6292 + + [SPARK-7237] [SPARK-7741] [CORE] [STREAMING] Clean more closures that need cleaning + Andrew Or + 2015-05-20 15:39:32 -0700 + Commit: 9b84443, github.com/apache/spark/pull/6269 + + [SPARK-7511] [MLLIB] pyspark ml seed param should be random by default or 42 is quite funny but not very random + Holden Karau + 2015-05-20 15:16:12 -0700 + Commit: 191ee47, github.com/apache/spark/pull/6139 + + Revert "[SPARK-7320] [SQL] Add Cube / Rollup for dataframe" + Patrick Wendell + 2015-05-20 13:39:04 -0700 + Commit: 6338c40 + + [SPARK-7579] [ML] [DOC] User guide update for OneHotEncoder + Sandy Ryza + 2015-05-20 13:10:30 -0700 + Commit: 829f1d9, github.com/apache/spark/pull/6126 + + [SPARK-7537] [MLLIB] spark.mllib API updates + Xiangrui Meng + 2015-05-20 12:50:06 -0700 + Commit: 2ad4837, github.com/apache/spark/pull/6280 + + [SPARK-7713] [SQL] Use shared broadcast hadoop conf for partitioned table scan. 
+ Yin Huai + 2015-05-20 11:23:40 -0700 + Commit: b631bf7, github.com/apache/spark/pull/6252 + + [SPARK-6094] [MLLIB] Add MultilabelMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-20 07:55:51 -0700 + Commit: 98a46f9, github.com/apache/spark/pull/6276 + + [SPARK-7654] [MLLIB] Migrate MLlib to the DataFrame reader/writer API + Xiangrui Meng + 2015-05-20 07:46:17 -0700 + Commit: 589b12f, github.com/apache/spark/pull/6281 + + [SPARK-7533] [YARN] Decrease spacing between AM-RM heartbeats. + ehnalis + 2015-05-20 08:27:39 -0500 + Commit: 3ddf051, github.com/apache/spark/pull/6082 + + [SPARK-7320] [SQL] Add Cube / Rollup for dataframe + Cheng Hao + 2015-05-20 19:09:47 +0800 + Commit: 09265ad, github.com/apache/spark/pull/6257 + + [SPARK-7663] [MLLIB] Add requirement for word2vec model + Xusen Yin + 2015-05-20 10:41:18 +0100 + Commit: b3abf0b, github.com/apache/spark/pull/6228 + + [SPARK-7656] [SQL] use CatalystConf in FunctionRegistry + scwf + 2015-05-19 17:36:00 -0700 + Commit: 60336e3, github.com/apache/spark/pull/6164 + + [SPARK-7744] [DOCS] [MLLIB] Distributed matrix" section in MLlib "Data Types" documentation should be reordered. + Mike Dusenberry + 2015-05-19 17:18:08 -0700 + Commit: 3860520, github.com/apache/spark/pull/6270 + + [SPARK-6246] [EC2] fixed support for more than 100 nodes + alyaxey + 2015-05-19 16:45:52 -0700 + Commit: 2bc5e06, github.com/apache/spark/pull/6267 + + [SPARK-7662] [SQL] Resolve correct names for generator in projection + Cheng Hao + 2015-05-19 15:20:46 -0700 + Commit: bcb1ff8, github.com/apache/spark/pull/6178 + + [SPARK-7738] [SQL] [PySpark] add reader and writer API in Python + Davies Liu + 2015-05-19 14:23:28 -0700 + Commit: 4de74d2, github.com/apache/spark/pull/6238 + + [SPARK-7652] [MLLIB] Update the implementation of naive Bayes prediction with BLAS + Liang-Chi Hsieh + 2015-05-19 13:53:08 -0700 + Commit: c12dff9, github.com/apache/spark/pull/6189 + + [SPARK-7586] [ML] [DOC] Add docs of Word2Vec in ml package + Xusen Yin + 2015-05-19 13:43:48 -0700 + Commit: 68fb2a4, github.com/apache/spark/pull/6181 + + [SPARK-7726] Fix Scaladoc false errors + Iulian Dragos + 2015-05-19 12:14:48 -0700 + Commit: 3c4c1f9, github.com/apache/spark/pull/6260 + + [SPARK-7678] [ML] Fix default random seed in HasSeed + Joseph K. Bradley + 2015-05-19 10:57:47 -0700 + Commit: 7b16e9f, github.com/apache/spark/pull/6251 + + [SPARK-7047] [ML] ml.Model optional parent support + Joseph K. Bradley + 2015-05-19 10:55:21 -0700 + Commit: fb90273, github.com/apache/spark/pull/5914 + + [SPARK-7704] Updating Programming Guides per SPARK-4397 + Dice + 2015-05-19 18:12:05 +0100 + Commit: 32fa611, github.com/apache/spark/pull/6234 + + [SPARK-7681] [MLLIB] remove mima excludes for 1.3 + Xiangrui Meng + 2015-05-19 08:24:57 -0700 + Commit: 6845cb2, github.com/apache/spark/pull/6254 + + [SPARK-7723] Fix string interpolation in pipeline examples + Saleem Ansari + 2015-05-19 10:31:11 +0100 + Commit: df34793, github.com/apache/spark/pull/6258 + + [HOTFIX] Revert "[SPARK-7092] Update spark scala version to 2.11.6" + Patrick Wendell + 2015-05-19 02:28:41 -0700 + Commit: 27fa88b + + Fixing a few basic typos in the Programming Guide. 
+ Mike Dusenberry + 2015-05-19 08:59:45 +0100 + Commit: 61f164d, github.com/apache/spark/pull/6240 + + [SPARK-7581] [ML] [DOC] User guide for spark.ml PolynomialExpansion + Xusen Yin + 2015-05-19 00:06:33 -0700 + Commit: 6008ec1, github.com/apache/spark/pull/6113 + + [HOTFIX] Fixing style failures in Kinesis source + Patrick Wendell + 2015-05-19 00:02:06 -0700 + Commit: 23cf897 + + [HOTFIX]: Java 6 Build Breaks + Patrick Wendell + 2015-05-19 06:00:13 +0000 + Commit: 9ebb44f + + [SPARK-7687] [SQL] DataFrame.describe() should cast all aggregates to String + Josh Rosen + 2015-05-18 21:53:44 -0700 + Commit: c9fa870, github.com/apache/spark/pull/6218 + + [SPARK-7150] SparkContext.range() and SQLContext.range() + Daoyuan Wang , Davies Liu + 2015-05-18 21:43:12 -0700 + Commit: c2437de, github.com/apache/spark/pull/6081 + + [SPARK-7681] [MLLIB] Add SparseVector support for gemv + Liang-Chi Hsieh + 2015-05-18 21:32:36 -0700 + Commit: d03638c, github.com/apache/spark/pull/6209 + + [SPARK-7692] Updated Kinesis examples + Tathagata Das + 2015-05-18 18:24:15 -0700 + Commit: 3a60038, github.com/apache/spark/pull/6249 + + [SPARK-7621] [STREAMING] Report Kafka errors to StreamingListeners + jerluc + 2015-05-18 18:13:29 -0700 + Commit: 0a7a94e, github.com/apache/spark/pull/6204 + + [SPARK-7624] Revert #4147 + Davies Liu + 2015-05-18 16:55:45 -0700 + Commit: 4fb52f9, github.com/apache/spark/pull/6172 + + [SQL] Fix serializability of ORC table scan + Michael Armbrust + 2015-05-18 15:24:31 -0700 + Commit: eb4632f, github.com/apache/spark/pull/6247 + + [SPARK-7063] when lz4 compression is used, it causes core dump + Jihong MA + 2015-05-18 22:47:50 +0100 + Commit: 6525fc0, github.com/apache/spark/pull/6226 + + [SPARK-7501] [STREAMING] DAG visualization: show DStream operations + Andrew Or + 2015-05-18 14:33:33 -0700 + Commit: b93c97d, github.com/apache/spark/pull/6034 + + [HOTFIX] Fix ORC build break + Michael Armbrust + 2015-05-18 14:04:04 -0700 + Commit: fcf90b7, github.com/apache/spark/pull/6244 + + [SPARK-7658] [STREAMING] [WEBUI] Update the mouse behaviors for the timeline graphs + zsxwing + 2015-05-18 13:34:43 -0700 + Commit: 0b6f503, github.com/apache/spark/pull/6168 + + [SPARK-6216] [PYSPARK] check python version of worker with driver + Davies Liu + 2015-05-18 12:55:13 -0700 + Commit: 32fbd29, github.com/apache/spark/pull/6203 + + [SPARK-7673] [SQL] WIP: HadoopFsRelation and ParquetRelation2 performance optimizations + Cheng Lian + 2015-05-18 12:45:37 -0700 + Commit: 9dadf01, github.com/apache/spark/pull/6225 + + [SPARK-7567] [SQL] [follow-up] Use a new flag to set output committer based on mapreduce apis + Yin Huai + 2015-05-18 12:17:10 -0700 + Commit: 530397b, github.com/apache/spark/pull/6130 + + [SPARK-7269] [SQL] Incorrect analysis for aggregation(use semanticEquals) + Wenchen Fan + 2015-05-18 12:08:28 -0700 + Commit: 103c863, github.com/apache/spark/pull/6173 + + [SPARK-7631] [SQL] treenode argString should not print children + scwf + 2015-05-18 12:05:14 -0700 + Commit: fc2480e, github.com/apache/spark/pull/6144 + + [SPARK-2883] [SQL] ORC data source for Spark SQL + Zhan Zhang , Cheng Lian + 2015-05-18 12:03:27 -0700 + Commit: aa31e43, github.com/apache/spark/pull/6194 + + [SPARK-7380] [MLLIB] pipeline stages should be copyable in 
Python + Xiangrui Meng , Joseph K. Bradley + 2015-05-18 12:02:18 -0700 + Commit: 9c7e802, github.com/apache/spark/pull/6088 + + [SQL] [MINOR] [THIS] use private for internal field in ScalaUdf + Wenchen Fan + 2015-05-18 12:01:30 -0700 + Commit: 56ede88, github.com/apache/spark/pull/6235 + + [SPARK-7570] [SQL] Ignores _temporary during partition discovery + Cheng Lian + 2015-05-18 11:59:44 -0700 + Commit: 010a1c2, github.com/apache/spark/pull/6091 + + [SPARK-6888] [SQL] Make the jdbc driver handling user-definable + Rene Treffer + 2015-05-18 11:55:36 -0700 + Commit: e1ac2a9, github.com/apache/spark/pull/5555 + + [SPARK-7627] [SPARK-7472] DAG visualization: style skipped stages + Andrew Or + 2015-05-18 10:59:35 -0700 + Commit: 563bfcc, github.com/apache/spark/pull/6171 + + [SPARK-7272] [MLLIB] User guide for PMML model export + Vincenzo Selvaggio + 2015-05-18 08:46:33 -0700 + Commit: 814b3da, github.com/apache/spark/pull/6219 + + [SPARK-6657] [PYSPARK] Fix doc warnings + Xiangrui Meng + 2015-05-18 08:35:14 -0700 + Commit: 1ecfac6, github.com/apache/spark/pull/6221 + + [SPARK-7299][SQL] Set precision and scale for Decimal according to JDBC metadata instead of returned BigDecimal + Liang-Chi Hsieh + 2015-05-18 01:10:55 -0700 + Commit: e32c0f6, github.com/apache/spark/pull/5833 + + [SPARK-7694] [MLLIB] Use getOrElse for getting the threshold of LR model + Shuo Xiang + 2015-05-17 21:16:52 -0700 + Commit: 775e6f9, github.com/apache/spark/pull/6224 + + [SPARK-7693][Core] Remove "import scala.concurrent.ExecutionContext.Implicits.global" + zsxwing + 2015-05-17 20:37:19 -0700 + Commit: ff71d34, github.com/apache/spark/pull/6223 + + [SQL] [MINOR] use catalyst type converter in ScalaUdf + Wenchen Fan + 2015-05-17 16:51:57 -0700 + Commit: 2f22424, github.com/apache/spark/pull/6182 + + [SPARK-6514] [SPARK-5960] [SPARK-6656] [SPARK-7679] [STREAMING] [KINESIS] Updates to the Kinesis API + Tathagata Das + 2015-05-17 16:49:07 -0700 + Commit: ca4257a, github.com/apache/spark/pull/6147 + + [SPARK-7491] [SQL] Allow configuration of classloader isolation for hive + Michael Armbrust + 2015-05-17 12:43:15 -0700 + Commit: 2ca60ac, github.com/apache/spark/pull/6167 + + [SPARK-7686] [SQL] DescribeCommand is assigned wrong output attributes in SparkStrategies + Josh Rosen + 2015-05-17 11:59:28 -0700 + Commit: 5645628, github.com/apache/spark/pull/6217 + + [SPARK-7660] Wrap SnappyOutputStream to work around snappy-java bug + Josh Rosen + 2015-05-17 09:30:49 -0700 + Commit: f2cc6b5, github.com/apache/spark/pull/6176 + + [SPARK-7669] Builds against Hadoop 2.6+ get inconsistent curator depend… + Steve Loughran + 2015-05-17 17:03:11 +0100 + Commit: 5021766, github.com/apache/spark/pull/6191 + + [SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized + Liang-Chi Hsieh + 2015-05-17 15:42:21 +0800 + Commit: 3399055, github.com/apache/spark/pull/6012 + + [SQL] [MINOR] Skip unresolved expression for InConversion + scwf + 2015-05-17 15:17:11 +0800 + Commit: edf09ea, github.com/apache/spark/pull/6145 + + [MINOR] Add 1.3, 1.3.1 to master branch EC2 scripts + Shivaram Venkataraman + 2015-05-17 00:12:20 -0700 + Commit: 1a7b9ce, github.com/apache/spark/pull/6215 + + [MINOR] [SQL] Removes an unreachable case clause + Cheng Lian + 2015-05-16 
23:20:09 -0700 + Commit: ba4f8ca, github.com/apache/spark/pull/6214 + + [SPARK-7654][SQL] Move JDBC into DataFrame's reader/writer interface. + Reynold Xin + 2015-05-16 22:01:53 -0700 + Commit: 517eb37, github.com/apache/spark/pull/6210 + + [SPARK-7655][Core] Deserializing value should not hold the TaskSchedulerImpl lock + zsxwing + 2015-05-16 21:03:22 -0700 + Commit: 3b6ef2c, github.com/apache/spark/pull/6195 + + [SPARK-7654][MLlib] Migrate MLlib to the DataFrame reader/writer API. + Reynold Xin + 2015-05-16 15:03:57 -0700 + Commit: 161d0b4, github.com/apache/spark/pull/6211 + + [BUILD] update jblas dependency version to 1.2.4 + Matthew Brandyberry + 2015-05-16 18:17:48 +0100 + Commit: 1b4e710, github.com/apache/spark/pull/6199 + + [HOTFIX] [SQL] Fixes DataFrameWriter.mode(String) + Cheng Lian + 2015-05-16 20:55:10 +0800 + Commit: ce63912, github.com/apache/spark/pull/6212 + + [SPARK-7655][Core][SQL] Remove 'scala.concurrent.ExecutionContext.Implicits.global' in 'ask' and 'BroadcastHashJoin' + zsxwing + 2015-05-16 00:44:29 -0700 + Commit: 47e7ffe, github.com/apache/spark/pull/6200 + + [SPARK-7672] [CORE] Use int conversion in translating kryoserializer.buffer.mb to kryoserializer.buffer + Nishkam Ravi , nishkamravi2 , nravi + 2015-05-16 08:24:21 +0100 + Commit: 0ac8b01, github.com/apache/spark/pull/6198 + + [SPARK-4556] [BUILD] binary distribution assembly can't run in local mode + Sean Owen + 2015-05-16 08:18:41 +0100 + Commit: 1fd3381, github.com/apache/spark/pull/6186 + + [SPARK-7671] Fix wrong URLs in MLlib Data Types Documentation + FavioVazquez + 2015-05-16 08:07:03 +0100 + Commit: d41ae43, github.com/apache/spark/pull/6196 + + [SPARK-7654][SQL] DataFrameReader and DataFrameWriter for input/output API + Reynold Xin + 2015-05-15 22:00:31 -0700 + Commit: 578bfee, github.com/apache/spark/pull/6175 + + [SPARK-7473] [MLLIB] Add reservoir sample in RandomForest + AiHe + 2015-05-15 20:42:35 -0700 + Commit: deb4113, github.com/apache/spark/pull/5988 + + [SPARK-7543] [SQL] [PySpark] split dataframe.py into multiple files + Davies Liu + 2015-05-15 20:09:15 -0700 + Commit: d7b6994, github.com/apache/spark/pull/6201 + + [SPARK-7073] [SQL] [PySpark] Clean up SQL data type hierarchy in Python + Davies Liu + 2015-05-15 20:05:26 -0700 + Commit: adfd366, github.com/apache/spark/pull/6206 + + [SPARK-7575] [ML] [DOC] Example code for OneVsRest + Ram Sriharsha + 2015-05-15 19:33:20 -0700 + Commit: cc12a86, github.com/apache/spark/pull/6115 + + [SPARK-7563] OutputCommitCoordinator.stop() should only run on the driver + Josh Rosen + 2015-05-15 18:06:01 -0700 + Commit: 2c04c8a, github.com/apache/spark/pull/6197 + + [SPARK-7676] Bug fix and cleanup of stage timeline view + Kay Ousterhout + 2015-05-15 17:45:14 -0700 + Commit: e745456, github.com/apache/spark/pull/6202 + + [SPARK-7556] [ML] [DOC] Add user guide for spark.ml Binarizer, including Scala, Java and Python examples + Liang-Chi Hsieh + 2015-05-15 15:05:04 -0700 + Commit: c869633, github.com/apache/spark/pull/6116 + + [SPARK-7677] [STREAMING] Add Kafka modules to the 2.11 build. 
+ Iulian Dragos + 2015-05-15 14:57:29 -0700 + Commit: 6e77105, github.com/apache/spark/pull/6149 + + [SPARK-7226] [SPARKR] Support math functions in R DataFrame + qhuang + 2015-05-15 14:06:16 -0700 + Commit: 50da9e8, github.com/apache/spark/pull/6170 + + [SPARK-7296] Add timeline visualization for stages in the UI. + Kousuke Saruta + 2015-05-15 13:54:09 -0700 + Commit: 9b6cf28, github.com/apache/spark/pull/5843 + + [SPARK-7504] [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode + ehnalis + 2015-05-15 12:14:02 -0700 + Commit: 8e3822a, github.com/apache/spark/pull/6083 + + [SPARK-7664] [WEBUI] DAG visualization: Fix incorrect link paths of DAG. + Kousuke Saruta + 2015-05-15 11:54:13 -0700 + Commit: ad92af9, github.com/apache/spark/pull/6184 + + [SPARK-5412] [DEPLOY] Cannot bind Master to a specific hostname as per the documentation + Sean Owen + 2015-05-15 11:30:19 -0700 + Commit: 8ab1450, github.com/apache/spark/pull/6185 + + [CORE] Protect additional test vars from early GC + Tim Ellison + 2015-05-15 11:27:24 -0700 + Commit: 270d4b5, github.com/apache/spark/pull/6187 + + [SPARK-7233] [CORE] Detect REPL mode once + Oleksii Kostyliev , Oleksii Kostyliev + 2015-05-15 11:19:56 -0700 + Commit: b1b9d58, github.com/apache/spark/pull/5835 + + [SPARK-7651] [MLLIB] [PYSPARK] GMM predict, predictSoft should raise error on bad input + FlytxtRnD + 2015-05-15 10:43:18 -0700 + Commit: 8f4aaba, github.com/apache/spark/pull/6180 + + [SPARK-7668] [MLLIB] Preserve isTransposed property for Matrix after calling map function + Liang-Chi Hsieh + 2015-05-15 10:03:29 -0700 + Commit: f96b85a, github.com/apache/spark/pull/6188 + + [SPARK-7503] [YARN] Resources in .sparkStaging directory can't be cleaned up on error + Kousuke Saruta + 2015-05-15 11:37:34 +0100 + Commit: c64ff80, github.com/apache/spark/pull/6026 + + [SPARK-7591] [SQL] Partitioning support API tweaks + Cheng Lian + 2015-05-15 16:20:49 +0800 + Commit: fdf5bba, github.com/apache/spark/pull/6150 + + [SPARK-6258] [MLLIB] GaussianMixture Python API parity check + Yanbo Liang + 2015-05-15 00:18:39 -0700 + Commit: 9476148, github.com/apache/spark/pull/6087 + + [SPARK-7650] [STREAMING] [WEBUI] Move streaming css and js files to the streaming project + zsxwing + 2015-05-14 23:51:41 -0700 + Commit: cf842d4, github.com/apache/spark/pull/6160 + + [CORE] Remove unreachable Heartbeat message from Worker + Kan Zhang + 2015-05-14 23:50:50 -0700 + Commit: daf4ae7, github.com/apache/spark/pull/6163 + + [HOTFIX] Add workaround for SPARK-7660 to fix JavaAPISuite failures. + Josh Rosen + 2015-05-14 23:17:41 -0700 + Commit: 7da33ce + + [SQL] When creating partitioned table scan, explicitly create UnionRDD. 
+ Yin Huai + 2015-05-15 12:04:26 +0800 + Commit: e8f0e01, github.com/apache/spark/pull/6162 + + [SPARK-7098][SQL] Make the WHERE clause with timestamp show consistent result + Liang-Chi Hsieh + 2015-05-14 20:49:21 -0700 + Commit: f9705d4, github.com/apache/spark/pull/5682 + + [SPARK-7548] [SQL] Add explode function for DataFrames + Michael Armbrust + 2015-05-14 19:49:44 -0700 + Commit: 6d0633e, github.com/apache/spark/pull/6107 + + [SPARK-7619] [PYTHON] fix docstring signature + Xiangrui Meng + 2015-05-14 18:16:22 -0700 + Commit: 48fc38f, github.com/apache/spark/pull/6161 + + [SPARK-7648] [MLLIB] Add weights and intercept to GLM wrappers in spark.ml + Xiangrui Meng + 2015-05-14 18:13:58 -0700 + Commit: 723853e, github.com/apache/spark/pull/6156 + + [SPARK-7645] [STREAMING] [WEBUI] Show milliseconds in the UI if the batch interval < 1 second + zsxwing + 2015-05-14 16:58:36 -0700 + Commit: b208f99, github.com/apache/spark/pull/6154 + + [SPARK-7649] [STREAMING] [WEBUI] Use window.localStorage to store the status rather than the url + zsxwing + 2015-05-14 16:57:33 -0700 + Commit: 0a317c1, github.com/apache/spark/pull/6158 + + [SPARK-7643] [UI] use the correct size in RDDPage for storage info and partitions + Xiangrui Meng + 2015-05-14 16:56:32 -0700 + Commit: 57ed16c, github.com/apache/spark/pull/6157 + + [SPARK-7598] [DEPLOY] Add aliveWorkers metrics in Master + Rex Xiong + 2015-05-14 16:55:31 -0700 + Commit: 93dbb3a, github.com/apache/spark/pull/6117 + + Make SPARK prefix a variable + tedyu + 2015-05-14 15:26:35 -0700 + Commit: 11a1a13, github.com/apache/spark/pull/6153 + + [SPARK-7278] [PySpark] DateType should find datetime.datetime acceptable + ksonj + 2015-05-14 15:10:58 -0700 + Commit: 5d7d4f8, github.com/apache/spark/pull/6057 + + [SQL][minor] rename apply for QueryPlanner + Wenchen Fan + 2015-05-14 10:25:18 -0700 + Commit: f2cd00b, github.com/apache/spark/pull/6142 + + [SPARK-7249] Updated Hadoop dependencies due to inconsistency in the versions + FavioVazquez + 2015-05-14 15:22:58 +0100 + Commit: 7fb715d, github.com/apache/spark/pull/5786 + + [SPARK-7568] [ML] ml.LogisticRegression doesn't output the right prediction + DB Tsai + 2015-05-14 01:26:08 -0700 + Commit: c1080b6, github.com/apache/spark/pull/6109 + + [SPARK-7407] [MLLIB] use uid + name to identify parameters + Xiangrui Meng + 2015-05-14 01:22:15 -0700 + Commit: 1b8625f, github.com/apache/spark/pull/6019 + + [SPARK-7595] [SQL] Window will cause resolve failed with self join + linweizhong + 2015-05-14 00:23:27 -0700 + Commit: 13e652b, github.com/apache/spark/pull/6114 + + [SPARK-7620] [ML] [MLLIB] Removed calling size, length in while condition to avoid extra JVM call + DB Tsai + 2015-05-13 22:23:21 -0700 + Commit: d3db2fd, github.com/apache/spark/pull/6137 + + [SPARK-7612] [MLLIB] update NB training to use mllib's BLAS + Xiangrui Meng + 2015-05-13 21:27:17 -0700 + Commit: d5f18de, github.com/apache/spark/pull/6128 + + [HOT FIX #6125] Do not wait for all stages to start rendering + Andrew Or + 2015-05-13 21:04:13 -0700 + Commit: 3113da9, github.com/apache/spark/pull/6138 + + [HOTFIX] Use 'new Job' in fsBasedParquet.scala + zsxwing + 2015-05-13 17:58:29 -0700 + Commit: 728af88, github.com/apache/spark/pull/6136 + + [HOTFIX] Bug in merge script + 
Patrick Wendell + 2015-05-13 17:55:06 -0700 + Commit: 32e27df + + [SPARK-6752] [STREAMING] [REVISED] Allow StreamingContext to be recreated from checkpoint and existing SparkContext + Tathagata Das + 2015-05-13 17:33:15 -0700 + Commit: bce00da, github.com/apache/spark/pull/6096 + + [SPARK-7601] [SQL] Support Insert into JDBC Datasource + Venkata Ramana Gollamudi + 2015-05-13 17:24:04 -0700 + Commit: 59aaa1d, github.com/apache/spark/pull/6121 + + [SPARK-7081] Faster sort-based shuffle path using binary processing cache-aware sort + Josh Rosen + 2015-05-13 17:07:31 -0700 + Commit: 73bed40, github.com/apache/spark/pull/5868 + + [SPARK-7356] [STREAMING] Fix flakey tests in FlumePollingStreamSuite using SparkSink's batch CountDownLatch. + Hari Shreedharan + 2015-05-13 16:43:30 -0700 + Commit: 61d1e87, github.com/apache/spark/pull/5918 + + [STREAMING] [MINOR] Keep streaming.UIUtils private + Andrew Or + 2015-05-13 16:31:24 -0700 + Commit: bb6dec3, github.com/apache/spark/pull/6134 + + [SPARK-7502] DAG visualization: gracefully handle removed stages + Andrew Or + 2015-05-13 16:29:52 -0700 + Commit: aa18378, github.com/apache/spark/pull/6132 + + [SPARK-7464] DAG visualization: highlight the same RDDs on hover + Andrew Or + 2015-05-13 16:29:10 -0700 + Commit: 4440341, github.com/apache/spark/pull/6100 + + [SPARK-7399] Spark compilation error for scala 2.11 + Andrew Or + 2015-05-13 16:28:37 -0700 + Commit: f88ac70, github.com/apache/spark/pull/6129 + + [SPARK-7608] Clean up old state in RDDOperationGraphListener + Andrew Or + 2015-05-13 16:27:48 -0700 + Commit: f6e1838, github.com/apache/spark/pull/6125 + + [SQL] Move some classes into packages that are more appropriate. + Reynold Xin + 2015-05-13 16:15:31 -0700 + Commit: e683182, github.com/apache/spark/pull/6108 + + [SPARK-7303] [SQL] push down project if possible when the child is sort + scwf + 2015-05-13 16:13:48 -0700 + Commit: 59250fe, github.com/apache/spark/pull/5838 + + [SPARK-7382] [MLLIB] Feature Parity in PySpark for ml.classification + Burak Yavuz + 2015-05-13 15:13:09 -0700 + Commit: df2fb13, github.com/apache/spark/pull/6106 + + [SPARK-7545] [MLLIB] Added check in Bernoulli Naive Bayes to make sure that both training and predict features have values of 0 or 1 + leahmcguire + 2015-05-13 14:13:19 -0700 + Commit: 61e05fc, github.com/apache/spark/pull/6073 + + [SPARK-7593] [ML] Python Api for ml.feature.Bucketizer + Burak Yavuz + 2015-05-13 13:21:36 -0700 + Commit: 5db18ba, github.com/apache/spark/pull/6124 + + [MINOR] [CORE] Accept alternative mesos unsatisfied link error in test. + Tim Ellison , Tim Ellison + 2015-05-13 21:16:32 +0100 + Commit: 51030b8, github.com/apache/spark/pull/6119 + + [MINOR] Enhance SizeEstimator to detect IBM compressed refs and s390 … + Tim Ellison + 2015-05-13 21:01:42 +0100 + Commit: 3cd9ad2, github.com/apache/spark/pull/6085 + + [MINOR] Avoid passing the PermGenSize option to IBM JVMs. 
+ Tim Ellison , Tim Ellison + 2015-05-13 21:00:12 +0100 + Commit: e676fc0, github.com/apache/spark/pull/6055 + + [SPARK-7551][DataFrame] support backticks for DataFrame attribute resolution + Wenchen Fan + 2015-05-13 12:47:48 -0700 + Commit: 213a6f3, github.com/apache/spark/pull/6074 + + [SPARK-7567] [SQL] Migrating Parquet data source to FSBasedRelation + Cheng Lian + 2015-05-13 11:04:10 -0700 + Commit: 7ff16e8, github.com/apache/spark/pull/6090 + + [SPARK-7589] [STREAMING] [WEBUI] Make "Input Rate" in the Streaming page consistent with other pages + zsxwing + 2015-05-13 10:01:26 -0700 + Commit: bec938f, github.com/apache/spark/pull/6102 + + [SPARK-6734] [SQL] Add UDTF.close support in Generate + Cheng Hao + 2015-05-14 00:14:59 +0800 + Commit: 0da254f, github.com/apache/spark/pull/5383 + + [MINOR] [SQL] Removes debugging println + Cheng Lian + 2015-05-13 23:40:13 +0800 + Commit: aa6ba3f, github.com/apache/spark/pull/6123 + + [SQL] In InsertIntoFSBasedRelation.insert, log cause before abort job/task. + Yin Huai + 2015-05-13 23:36:19 +0800 + Commit: b061bd5, github.com/apache/spark/pull/6105 + + [SPARK-7599] [SQL] Don't restrict customized output committers to be subclasses of FileOutputCommitter + Cheng Lian + 2015-05-13 07:35:55 -0700 + Commit: 10c546e, github.com/apache/spark/pull/6118 + + [SPARK-6568] spark-shell.cmd --jars option does not accept the jar that has space in its path + Masayoshi TSUZUKI , Kousuke Saruta + 2015-05-13 09:43:40 +0100 + Commit: 50c7270, github.com/apache/spark/pull/5447 + + [SPARK-7526] [SPARKR] Specify ip of RBackend, MonitorServer and RRDD Socket server + linweizhong + 2015-05-12 23:55:44 -0700 + Commit: 98195c3, github.com/apache/spark/pull/6053 + + [SPARK-7482] [SPARKR] Rename some DataFrame API methods in SparkR to match their counterparts in Scala. + Sun Rui + 2015-05-12 23:52:30 -0700 + Commit: df9b94a, github.com/apache/spark/pull/6007 + + [SPARK-7566][SQL] Add type to HiveContext.analyzer + Santiago M. Mola + 2015-05-12 23:44:21 -0700 + Commit: 208b902, github.com/apache/spark/pull/6086 + + [SPARK-7321][SQL] Add Column expression for conditional statements (when/otherwise) + Reynold Xin , kaka1992 + 2015-05-12 21:43:34 -0700 + Commit: 97dee31, github.com/apache/spark/pull/6072 + + [SPARK-7588] Document all SQL/DataFrame public methods with @since tag + Reynold Xin + 2015-05-12 18:37:02 -0700 + Commit: 8fd5535, github.com/apache/spark/pull/6101 + + [SPARK-7592] Always set resolution to "Fixed" in PR merge script. 
+ Patrick Wendell + 2015-05-12 18:20:54 -0700 + Commit: 1b9e434, github.com/apache/spark/pull/6103 + + [HOTFIX] Use the old Job API to support old Hadoop versions + zsxwing + 2015-05-13 08:33:24 +0800 + Commit: 247b703, github.com/apache/spark/pull/6095 + + [SPARK-7572] [MLLIB] do not import Param/Params under pyspark.ml + Xiangrui Meng + 2015-05-12 17:15:39 -0700 + Commit: 77f64c7, github.com/apache/spark/pull/6094 + + [SPARK-7554] [STREAMING] Throw exception when an active/stopped StreamingContext is used to create DStreams and output operations + Tathagata Das + 2015-05-12 17:07:21 -0700 + Commit: 23f7d66, github.com/apache/spark/pull/6099 + + [SPARK-7528] [MLLIB] make RankingMetrics Java-friendly + Xiangrui Meng + 2015-05-12 16:53:47 -0700 + Commit: 2713bc6, github.com/apache/spark/pull/6098 + + [SPARK-7553] [STREAMING] Added methods to maintain a singleton StreamingContext + Tathagata Das + 2015-05-12 16:44:14 -0700 + Commit: 00e7b09, github.com/apache/spark/pull/6070 + + [SPARK-7573] [ML] OneVsRest cleanups + Joseph K. Bradley + 2015-05-12 16:42:30 -0700 + Commit: 96c4846, github.com/apache/spark/pull/6097 + + [SPARK-7557] [ML] [DOC] User guide for spark.ml HashingTF, Tokenizer + Joseph K. Bradley + 2015-05-12 16:39:56 -0700 + Commit: f0c1bc3, github.com/apache/spark/pull/6093 + + [SPARK-7496] [MLLIB] Update Programming guide with Online LDA + Yuhao Yang + 2015-05-12 15:12:29 -0700 + Commit: 1d70366, github.com/apache/spark/pull/6046 + + [SPARK-7406] [STREAMING] [WEBUI] Add tooltips for "Scheduling Delay", "Processing Time" and "Total Delay" + zsxwing + 2015-05-12 14:41:21 -0700 + Commit: 1422e79, github.com/apache/spark/pull/5952 + + [SPARK-7571] [MLLIB] rename Math to math + Xiangrui Meng + 2015-05-12 14:39:03 -0700 + Commit: a4874b0, github.com/apache/spark/pull/6092 + + [SPARK-7484][SQL]Support jdbc connection properties + Venkata Ramana Gollamudi + 2015-05-12 14:37:23 -0700 + Commit: 455551d, github.com/apache/spark/pull/6009 + + [SPARK-7559] [MLLIB] Bucketizer should include the right most boundary in the last bucket. 
+ Xiangrui Meng + 2015-05-12 14:24:26 -0700 + Commit: 23b9863, github.com/apache/spark/pull/6075 + + [SPARK-7569][SQL] Better error for invalid binary expressions + Michael Armbrust + 2015-05-12 13:36:55 -0700 + Commit: 2a41c0d, github.com/apache/spark/pull/6089 + + [SPARK-7015] [MLLIB] [WIP] Multiclass to Binary Reduction: One Against All + Ram Sriharsha + 2015-05-12 13:35:12 -0700 + Commit: 595a675, github.com/apache/spark/pull/5830 + + [SPARK-2018] [CORE] Upgrade LZF library to fix endian serialization p… + Tim Ellison + 2015-05-12 20:48:26 +0100 + Commit: 5438f49, github.com/apache/spark/pull/6077 + + [SPARK-7487] [ML] Feature Parity in PySpark for ml.regression + Burak Yavuz + 2015-05-12 12:17:05 -0700 + Commit: 8e935b0, github.com/apache/spark/pull/6016 + + [HOT FIX #6076] DAG visualization: curve the edges + Andrew Or + 2015-05-12 12:06:30 -0700 + Commit: b9b01f4 + + [SPARK-7276] [DATAFRAME] speed up DataFrame.select by collapsing Project + Wenchen Fan + 2015-05-12 11:51:55 -0700 + Commit: 4e29052, github.com/apache/spark/pull/5831 + + [SPARK-7500] DAG visualization: move cluster labeling to dagre-d3 + Andrew Or + 2015-05-12 11:17:59 -0700 + Commit: 65697bb, github.com/apache/spark/pull/6076 + + [DataFrame][minor] support column in field accessor + Wenchen Fan + 2015-05-12 10:37:57 -0700 + Commit: bfcaf8a, github.com/apache/spark/pull/6080 + + [SPARK-3928] [SPARK-5182] [SQL] Partitioning support for the data sources API + Cheng Lian + 2015-05-13 01:32:28 +0800 + Commit: 0595b6d, github.com/apache/spark/pull/5526 + + [DataFrame][minor] cleanup unapply methods in DataTypes + Wenchen Fan + 2015-05-12 10:28:40 -0700 + Commit: 831504c, github.com/apache/spark/pull/6079 + + [SPARK-6876] [PySpark] [SQL] add DataFrame na.replace in pyspark + Daoyuan Wang + 2015-05-12 10:23:41 -0700 + Commit: d86ce84, github.com/apache/spark/pull/6003 + + [SPARK-7532] [STREAMING] StreamingContext.start() made to logWarning and not throw exception + Tathagata Das + 2015-05-12 08:48:24 -0700 + Commit: ec6f2a9, github.com/apache/spark/pull/6060 + + [SPARK-7467] Dag visualization: treat checkpoint as an RDD operation + Andrew Or + 2015-05-12 01:40:55 -0700 + Commit: f3e8e60, github.com/apache/spark/pull/6004 + + [SPARK-7485] [BUILD] Remove pyspark files from assembly. + Marcelo Vanzin + 2015-05-12 01:39:21 -0700 + Commit: 82e890f, github.com/apache/spark/pull/6022 + + [MINOR] [PYSPARK] Set PYTHONPATH to python/lib/pyspark.zip rather than python/pyspark + linweizhong + 2015-05-12 01:36:27 -0700 + Commit: 9847875, github.com/apache/spark/pull/6047 + + [SPARK-7534] [CORE] [WEBUI] Fix the Stage table when a stage is missing + zsxwing + 2015-05-12 01:34:33 -0700 + Commit: 8a4edec, github.com/apache/spark/pull/6061 + + [SPARK-6994][SQL] Update docs for fetching Row fields by name + vidmantas zemleris + 2015-05-11 22:29:24 -0700 + Commit: 640f63b, github.com/apache/spark/pull/6030 + + [SQL] Rename Dialect -> ParserDialect. 
+ Reynold Xin + 2015-05-11 22:06:56 -0700 + Commit: 1669675, github.com/apache/spark/pull/6071 + + [SPARK-7435] [SPARKR] Make DataFrame.show() consistent with that of Scala and pySpark + Joshi , Rekha Joshi + 2015-05-11 21:02:34 -0700 + Commit: b94a933, github.com/apache/spark/pull/5989 + + [SPARK-7509][SQL] DataFrame.drop in Python for dropping columns. + Reynold Xin + 2015-05-11 20:04:36 -0700 + Commit: 028ad4b, github.com/apache/spark/pull/6068 + + [SPARK-7437] [SQL] Fold "literal in (item1, item2, ..., literal, ...)" into true or false directly + Zhongshuai Pei <799203320@qq.com>, DoingDone9 <799203320@qq.com> + 2015-05-11 19:22:44 -0700 + Commit: 4b5e1fe, github.com/apache/spark/pull/5972 + + [SPARK-7411] [SQL] Support SerDe for HiveQl in CTAS + Cheng Hao + 2015-05-11 19:21:16 -0700 + Commit: e35d878, github.com/apache/spark/pull/5963 + + [SPARK-7324] [SQL] DataFrame.dropDuplicates + Reynold Xin + 2015-05-11 19:15:14 -0700 + Commit: b6bf4f7, github.com/apache/spark/pull/6066 + + [SPARK-7530] [STREAMING] Added StreamingContext.getState() to expose the current state of the context + Tathagata Das + 2015-05-11 18:53:50 -0700 + Commit: f9c7580, github.com/apache/spark/pull/6058 + + [SPARK-5893] [ML] Add bucketizer + Xusen Yin , Joseph K. Bradley + 2015-05-11 18:41:22 -0700 + Commit: 35fb42a, github.com/apache/spark/pull/5980 + + Updated DataFrame.saveAsTable Hive warning to include SPARK-7550 ticket. + Reynold Xin + 2015-05-11 18:10:45 -0700 + Commit: 87229c9, github.com/apache/spark/pull/6067 + + [SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames. + Reynold Xin + 2015-05-11 18:07:12 -0700 + Commit: 3a9b699, github.com/apache/spark/pull/6062 + + [SPARK-7084] improve saveAsTable documentation + madhukar + 2015-05-11 17:04:11 -0700 + Commit: 57255dc, github.com/apache/spark/pull/5654 + + [SQL] Show better error messages for incorrect join types in DataFrames. + Reynold Xin + 2015-05-11 17:02:11 -0700 + Commit: 4f4dbb0, github.com/apache/spark/pull/6064 + + [MINOR] [DOCS] Fix the link to test building info on the wiki + Sean Owen + 2015-05-12 00:25:43 +0100 + Commit: 91dc3df, github.com/apache/spark/pull/6063 + + Update Documentation: leftsemi instead of semijoin + LCY Vincent + 2015-05-11 14:48:10 -0700 + Commit: a8ea096, github.com/apache/spark/pull/5944 + + [STREAMING] [MINOR] Close files correctly when iterator is finished in streaming WAL recovery + jerryshao + 2015-05-11 14:38:58 -0700 + Commit: 25c01c5, github.com/apache/spark/pull/6050 + + [SPARK-7516] [Minor] [DOC] Replace depreciated inferSchema() with createDataFrame() + gchen + 2015-05-11 14:37:18 -0700 + Commit: 8e67433, github.com/apache/spark/pull/6041 + + [SPARK-7515] [DOC] Update documentation for PySpark on YARN with cluster mode + Kousuke Saruta + 2015-05-11 14:19:11 -0700 + Commit: 6e9910c, github.com/apache/spark/pull/6040 + + [SPARK-7508] JettyUtils-generated servlets to log & report all errors + Steve Loughran + 2015-05-11 13:35:06 -0700 + Commit: 7ce2a33, github.com/apache/spark/pull/6033 + + [SPARK-6470] [YARN] Add support for YARN node labels. 
+ Sandy Ryza + 2015-05-11 12:09:39 -0700 + Commit: 82fee9d, github.com/apache/spark/pull/5242 + + [SPARK-7462] By default retain group by columns in aggregate + Reynold Xin , Shivaram Venkataraman + 2015-05-11 11:35:16 -0700 + Commit: 0a4844f, github.com/apache/spark/pull/5996 + + [SPARK-7361] [STREAMING] Throw unambiguous exception when attempting to start multiple StreamingContexts in the same JVM + Tathagata Das + 2015-05-11 10:58:56 -0700 + Commit: 1b46556, github.com/apache/spark/pull/5907 + + [SPARK-7522] [EXAMPLES] Removed angle brackets from dataFormat option + Bryan Cutler + 2015-05-11 09:23:47 -0700 + Commit: 4f8a155, github.com/apache/spark/pull/6049 + + [SPARK-6092] [MLLIB] Add RankingMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-11 09:14:20 -0700 + Commit: 042dda3, github.com/apache/spark/pull/6044 + + [SPARK-7326] [STREAMING] Performing window() on a WindowedDStream doesn't work all the time + Wesley Miao , Wesley + 2015-05-11 12:20:06 +0100 + Commit: d70a076, github.com/apache/spark/pull/5871 + + [SPARK-7519] [SQL] fix minor bugs in thrift server UI + tianyi + 2015-05-11 14:08:15 +0800 + Commit: 2242ab3, github.com/apache/spark/pull/6048 + + [SPARK-7512] [SPARKR] Fix RDD's show method to use getJRDD + Shivaram Venkataraman + 2015-05-10 19:49:42 -0700 + Commit: 0835f1e, github.com/apache/spark/pull/6035 + + [SPARK-7427] [PYSPARK] Make sharedParams match in Scala, Python + Glenn Weidner + 2015-05-10 19:18:32 -0700 + Commit: c5aca0c, github.com/apache/spark/pull/6023 + + [SPARK-5521] PCA wrapper for easy transform vectors + Kirill A. Korinskiy , Joseph K. Bradley + 2015-05-10 13:34:00 -0700 + Commit: 8c07c75, github.com/apache/spark/pull/4304 + + [SPARK-7431] [ML] [PYTHON] Made CrossValidatorModel call parent init in PySpark + Joseph K. 
Bradley + 2015-05-10 13:29:27 -0700 + Commit: 3038443, github.com/apache/spark/pull/5968 + + [MINOR] [SQL] Fixes variable name typo + Cheng Lian + 2015-05-10 21:26:36 +0800 + Commit: 6bf9352, github.com/apache/spark/pull/6038 + + [SPARK-7345][SQL] Spark cannot detect renamed columns using JDBC connector + Oleg Sidorkin + 2015-05-10 01:31:34 -0700 + Commit: d7a37bc, github.com/apache/spark/pull/6032 + + [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-10 00:57:14 -0700 + Commit: bf7e81a, github.com/apache/spark/pull/6011 + + [SPARK-7475] [MLLIB] adjust ldaExample for online LDA + Yuhao Yang + 2015-05-09 15:40:46 -0700 + Commit: b13162b, github.com/apache/spark/pull/6000 + + [BUILD] Reference fasterxml.jackson.version in sql/core/pom.xml + tedyu + 2015-05-09 13:19:07 -0700 + Commit: bd74301, github.com/apache/spark/pull/6031 + + Upgrade version of jackson-databind in sql/core/pom.xml + tedyu + 2015-05-09 10:41:30 -0700 + Commit: 3071aac, github.com/apache/spark/pull/6028 + + [STREAMING] [DOCS] Fix wrong url about API docs of StreamingListener + dobashim + 2015-05-09 10:14:46 +0100 + Commit: 7d0f172, github.com/apache/spark/pull/6024 + + [SPARK-7403] [WEBUI] Link URL in objects on Timeline View is wrong in case of running on YARN + Kousuke Saruta + 2015-05-09 10:10:29 +0100 + Commit: 12b95ab, github.com/apache/spark/pull/5947 + + [SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct + Vinod K C + 2015-05-09 10:03:15 +0100 + Commit: dda6d9f, github.com/apache/spark/pull/5974 + + [SPARK-7498] [ML] removed varargs annotation from Params.setDefaults + Joseph K. Bradley + 2015-05-08 21:55:54 -0700 + Commit: 2992623, github.com/apache/spark/pull/6021 + + [SPARK-7262] [ML] Binary LogisticRegression with L1/L2 (elastic net) using OWLQN in new ML package + DB Tsai + 2015-05-08 21:43:05 -0700 + Commit: 86ef4cf, github.com/apache/spark/pull/5967 + + [SPARK-7375] [SQL] Avoid row copying in exchange when sort.serializeMapOutputs takes effect + Josh Rosen + 2015-05-08 22:09:55 -0400 + Commit: cde5483, github.com/apache/spark/pull/5948 + + [SPARK-7231] [SPARKR] Changes to make SparkR DataFrame dplyr friendly. 
+ Shivaram Venkataraman + 2015-05-08 18:29:57 -0700 + Commit: 0a901dd, github.com/apache/spark/pull/6005 + + [SPARK-7451] [YARN] Preemption of executors is counted as failure causing Spark job to fail + Ashwin Shankar + 2015-05-08 17:51:00 -0700 + Commit: b6c797b, github.com/apache/spark/pull/5993 + + [SPARK-7488] [ML] Feature Parity in PySpark for ml.recommendation + Burak Yavuz + 2015-05-08 17:24:32 -0700 + Commit: 84bf931, github.com/apache/spark/pull/6015 + + [SPARK-7237] Clean function in several RDD methods + tedyu + 2015-05-08 17:16:38 -0700 + Commit: 54e6fa0, github.com/apache/spark/pull/5959 + + [SPARK-7469] [SQL] DAG visualization: show SQL query operators + Andrew Or + 2015-05-08 17:15:10 -0700 + Commit: bd61f07, github.com/apache/spark/pull/5999 + + [SPARK-6955] Perform port retries at NettyBlockTransferService level + Aaron Davidson + 2015-05-08 17:13:55 -0700 + Commit: ffdc40c, github.com/apache/spark/pull/5575 + + updated ec2 instance types + Brendan Collins + 2015-05-08 15:59:34 -0700 + Commit: 1c78f68, github.com/apache/spark/pull/6014 + + [SPARK-5913] [MLLIB] Python API for ChiSqSelector + Yanbo Liang + 2015-05-08 15:48:39 -0700 + Commit: 35c9599, github.com/apache/spark/pull/5939 + + [SPARK-4699] [SQL] Make caseSensitive configurable in spark sql analyzer + Jacky Li , wangfei , scwf + 2015-05-08 15:25:54 -0700 + Commit: 6dad76e, github.com/apache/spark/pull/5806 + + [SPARK-7390] [SQL] Only merge other CovarianceCounter when its count is greater than zero + Liang-Chi Hsieh + 2015-05-08 14:41:16 -0700 + Commit: 90527f5, github.com/apache/spark/pull/5931 + + [SPARK-7378] [CORE] Handle deep links to unloaded apps. + Marcelo Vanzin + 2015-05-08 14:12:58 -0700 + Commit: 5467c34, github.com/apache/spark/pull/5922 + + [MINOR] [CORE] Allow History Server to read kerberos opts from config file. 
+ Marcelo Vanzin + 2015-05-08 14:10:27 -0700 + Commit: 9042f8f, github.com/apache/spark/pull/5998 + + [SPARK-7466] DAG visualization: fix orphan nodes + Andrew Or + 2015-05-08 14:09:39 -0700 + Commit: 3b0c5e7, github.com/apache/spark/pull/6002 + + [MINOR] Defeat early garbage collection of test suite variable + Tim Ellison + 2015-05-08 14:08:52 -0700 + Commit: 31da40d, github.com/apache/spark/pull/6010 + + [SPARK-7489] [SPARK SHELL] Spark shell crashes when compiled with scala 2.11 + vinodkc + 2015-05-08 14:07:53 -0700 + Commit: 4e7360e, github.com/apache/spark/pull/6013 + + [WEBUI] Remove debug feature for vis.js + Kousuke Saruta + 2015-05-08 14:06:37 -0700 + Commit: c45c09b, github.com/apache/spark/pull/5994 + + [MINOR] Ignore python/lib/pyspark.zip + zsxwing + 2015-05-08 14:06:02 -0700 + Commit: dc71e47, github.com/apache/spark/pull/6017 + + [SPARK-7490] [CORE] [Minor] MapOutputTracker.deserializeMapStatuses: close input streams + Evan Jones + 2015-05-08 22:00:39 +0100 + Commit: 25889d8, github.com/apache/spark/pull/5982 + + [SPARK-6627] Finished rename to ShuffleBlockResolver + Kay Ousterhout + 2015-05-08 12:24:06 -0700 + Commit: 4b3bb0e, github.com/apache/spark/pull/5764 + + [SPARK-7133] [SQL] Implement struct, array, and map field accessor + Wenchen Fan + 2015-05-08 11:49:38 -0700 + Commit: 2d05f32, github.com/apache/spark/pull/5744 + + [SPARK-7298] Harmonize style of new visualizations + Matei Zaharia + 2015-05-08 14:41:42 -0400 + Commit: a1ec08f, github.com/apache/spark/pull/5942 + + [SPARK-7436] Fixed instantiation of custom recovery mode factory and added tests + Jacek Lewandowski + 2015-05-08 11:36:30 -0700 + Commit: 35d6a99, github.com/apache/spark/pull/5977 + + [SPARK-6824] Fill the docs for DataFrame API in SparkR + hqzizania , qhuang + 2015-05-08 11:25:04 -0700 + Commit: 008a60d, github.com/apache/spark/pull/5969 + + [SPARK-7474] [MLLIB] update ParamGridBuilder doctest + Xiangrui Meng + 2015-05-08 11:16:04 -0700 + Commit: 65afd3c, github.com/apache/spark/pull/6001 + + [SPARK-7383] [ML] Feature Parity in PySpark for ml.features + Burak Yavuz + 2015-05-08 11:14:39 -0700 + Commit: f5ff4a8, github.com/apache/spark/pull/5991 + + [SPARK-3454] separate json endpoints for data in the UI + Imran Rashid + 2015-05-08 16:54:32 +0100 + Commit: c796be7, github.com/apache/spark/pull/5940 + + [SPARK-6869] [PYSPARK] Add pyspark archives path to PYTHONPATH + Lianhui Wang + 2015-05-08 08:44:46 -0500 + Commit: ebff732, github.com/apache/spark/pull/5580 + + [SPARK-7392] [CORE] bugfix: Kryo buffer size cannot be larger than 2M + Zhang, Liye + 2015-05-08 09:10:58 +0100 + Commit: c2f0821, github.com/apache/spark/pull/5934 + + [SPARK-7232] [SQL] Add a Substitution batch for spark sql analyzer + wangfei + 2015-05-07 22:55:42 -0700 + Commit: f496bf3, github.com/apache/spark/pull/5776 + + [SPARK-7470] [SQL] Spark shell SQLContext crashes without hive + Andrew Or + 2015-05-07 22:32:13 -0700 + Commit: 714db2e, github.com/apache/spark/pull/5997 + + [SPARK-6986] [SQL] Use Serializer2 in more cases. 
+ Yin Huai + 2015-05-07 20:59:42 -0700 + Commit: 3af423c, github.com/apache/spark/pull/5849 + + [SPARK-7452] [MLLIB] fix bug in topBykey and update test + Shuo Xiang + 2015-05-07 20:55:08 -0700 + Commit: 92f8f80, github.com/apache/spark/pull/5990 + + [SPARK-6908] [SQL] Use isolated Hive client + Michael Armbrust + 2015-05-07 19:36:24 -0700 + Commit: cd1d411, github.com/apache/spark/pull/5876 + + [SPARK-7305] [STREAMING] [WEBUI] Make BatchPage show friendly information when jobs are dropped by SparkListener + zsxwing + 2015-05-07 17:34:44 -0700 + Commit: 22ab70e, github.com/apache/spark/pull/5840 + + [SPARK-7450] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() + tedyu + 2015-05-07 16:53:59 -0700 + Commit: 88063c6, github.com/apache/spark/pull/5897 + + [SPARK-2155] [SQL] [WHEN D THEN E] [ELSE F] add CaseKeyWhen for "CASE a WHEN b THEN c * END" + Wenchen Fan + 2015-05-07 16:26:49 -0700 + Commit: 35f0173, github.com/apache/spark/pull/5979 + + [SPARK-5281] [SQL] Registering table on RDD is giving MissingRequirementError + Iulian Dragos + 2015-05-07 16:24:11 -0700 + Commit: 937ba79, github.com/apache/spark/pull/5981 + + [SPARK-7277] [SQL] Throw exception if the property mapred.reduce.tasks is set to -1 + Liang-Chi Hsieh + 2015-05-07 16:22:45 -0700 + Commit: ea3077f, github.com/apache/spark/pull/5811 + + [SQL] [MINOR] make star and multialias extend NamedExpression + scwf + 2015-05-07 16:21:24 -0700 + Commit: 97d1182, github.com/apache/spark/pull/5928 + + [SPARK-6948] [MLLIB] compress vectors in VectorAssembler + Xiangrui Meng + 2015-05-07 15:45:37 -0700 + Commit: e43803b, github.com/apache/spark/pull/5985 + + [SPARK-5726] [MLLIB] Elementwise (Hadamard) Vector Product Transformer + Octavian Geagla , Joseph K. Bradley + 2015-05-07 14:49:55 -0700 + Commit: 658a478, github.com/apache/spark/pull/4580 + + [SPARK-7328] [MLLIB] [PYSPARK] Pyspark.mllib.linalg.Vectors: Missing items + MechCoder + 2015-05-07 14:02:05 -0700 + Commit: 347a329, github.com/apache/spark/pull/5872 + + [SPARK-7347] DAG visualization: add tooltips to RDDs + Andrew Or + 2015-05-07 12:29:56 -0700 + Commit: 88717ee, github.com/apache/spark/pull/5957 + + [SPARK-7391] DAG visualization: auto expand if linked from another viz + Andrew Or + 2015-05-07 12:29:18 -0700 + Commit: f121651, github.com/apache/spark/pull/5958 + + [SPARK-7373] [MESOS] Add docker support for launching drivers in mesos cluster mode. + Timothy Chen + 2015-05-07 12:23:16 -0700 + Commit: 4eecf55, github.com/apache/spark/pull/5917 + + [SPARK-7399] [SPARK CORE] Fixed compilation error in scala 2.11 + Tijo Thomas + 2015-05-07 12:21:09 -0700 + Commit: 0c33bf8, github.com/apache/spark/pull/5966 + + [SPARK-5213] [SQL] Remove the duplicated SparkSQLParser + Cheng Hao + 2015-05-07 12:09:54 -0700 + Commit: 074d75d, github.com/apache/spark/pull/5965 + + [SPARK-7116] [SQL] [PYSPARK] Remove cache() causing memory leak + ksonj + 2015-05-07 12:04:19 -0700 + Commit: dec8f53, github.com/apache/spark/pull/5973 + + [SPARK-1442] [SQL] [FOLLOW-UP] Address minor comments in Window Function PR (#5604). 
+ Yin Huai + 2015-05-07 11:46:49 -0700 + Commit: 5784c8d, github.com/apache/spark/pull/5945 + + [SPARK-6093] [MLLIB] Add RegressionMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-07 11:18:32 -0700 + Commit: 1712a7c, github.com/apache/spark/pull/5941 + + [SPARK-7118] [Python] Add the coalesce Spark SQL function available in PySpark + Olivier Girardot + 2015-05-07 10:58:35 -0700 + Commit: 068c315, github.com/apache/spark/pull/5698 + + [SPARK-7388] [SPARK-7383] wrapper for VectorAssembler in Python + Burak Yavuz , Xiangrui Meng + 2015-05-07 10:25:41 -0700 + Commit: 9e2ffb1, github.com/apache/spark/pull/5930 + + [SPARK-7330] [SQL] avoid NPE at jdbc rdd + Daoyuan Wang + 2015-05-07 10:05:01 -0700 + Commit: ed9be06, github.com/apache/spark/pull/5877 + + [SPARK-7429] [ML] Params cleanups + Joseph K. Bradley + 2015-05-07 01:28:44 -0700 + Commit: 4f87e95, github.com/apache/spark/pull/5960 + + [SPARK-7421] [MLLIB] OnlineLDA cleanups + Joseph K. Bradley + 2015-05-07 01:12:14 -0700 + Commit: 8b6b46e, github.com/apache/spark/pull/5956 + + [SPARK-7035] Encourage __getitem__ over __getattr__ on column access in the Python DataFrame API + ksonj + 2015-05-07 01:02:00 -0700 + Commit: fae4e2d, github.com/apache/spark/pull/5971 + + [SPARK-7295][SQL] bitwise operations for DataFrame DSL + Shiti + 2015-05-07 01:00:29 -0700 + Commit: fa8fddf, github.com/apache/spark/pull/5867 + + [SPARK-7217] [STREAMING] Add configuration to control the default behavior of StreamingContext.stop() implicitly calling SparkContext.stop() + Tathagata Das + 2015-05-07 00:24:44 -0700 + Commit: 01187f5, github.com/apache/spark/pull/5929 + + [SPARK-7430] [STREAMING] [TEST] General improvements to streaming tests to increase debuggability + Tathagata Das + 2015-05-07 00:21:10 -0700 + Commit: cfdadcb, github.com/apache/spark/pull/5961 + + [SPARK-5938] [SPARK-5443] [SQL] Improve JsonRDD performance + Nathan Howell + 2015-05-06 22:56:53 -0700 + Commit: 2d6612c, github.com/apache/spark/pull/5801 + + [SPARK-6812] [SPARKR] filter() on DataFrame does not work as expected. + Sun Rui + 2015-05-06 22:48:16 -0700 + Commit: 9cfa9a5, github.com/apache/spark/pull/5938 + + [SPARK-7432] [MLLIB] disable cv doctest + Xiangrui Meng + 2015-05-06 22:29:07 -0700 + Commit: 773aa25, github.com/apache/spark/pull/5962 + + [SPARK-7405] [STREAMING] Fix the bug that ReceiverInputDStream doesn't report InputInfo + zsxwing + 2015-05-06 18:07:00 -0700 + Commit: 14502d5, github.com/apache/spark/pull/5950 + + [HOT FIX] For DAG visualization #5954 + Andrew Or + 2015-05-06 18:02:08 -0700 + Commit: 71a452b + + [SPARK-7371] [SPARK-7377] [SPARK-7408] DAG visualization addendum (#5729) + Andrew Or + 2015-05-06 17:52:34 -0700 + Commit: 8fa6829, github.com/apache/spark/pull/5954 + + [SPARK-7396] [STREAMING] [EXAMPLE] Update KafkaWordCountProducer to use new Producer API + jerryshao + 2015-05-06 17:44:43 -0700 + Commit: 316a5c0, github.com/apache/spark/pull/5936 + + [SPARK-6799] [SPARKR] Remove SparkR RDD examples, add dataframe examples + Shivaram Venkataraman + 2015-05-06 17:28:11 -0700 + Commit: 4e93042, github.com/apache/spark/pull/5949 + + [HOT FIX] [SPARK-7418] Ignore flaky SparkSubmitUtilsSuite test + Andrew Or + 2015-05-06 17:08:39 -0700 + Commit: fbf1f34 + + [SPARK-5995] [ML] Make Prediction dev API public + Joseph K. 
Bradley + 2015-05-06 16:15:51 -0700 + Commit: 1ad04da, github.com/apache/spark/pull/5913 + + [HOT-FIX] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. + Yin Huai + 2015-05-06 14:48:25 -0700 + Commit: 7740996, github.com/apache/spark/pull/5951 + + Add `Private` annotation. + Josh Rosen + 2015-05-06 11:03:17 -0700 + Commit: 845d1d4 + + [SPARK-7311] Introduce internal Serializer API for determining if serializers support object relocation + Josh Rosen + 2015-05-06 10:52:55 -0700 + Commit: 002c123, github.com/apache/spark/pull/5924 + + [SPARK-1442] [SQL] Window Function Support for Spark SQL + Yin Huai + 2015-05-06 10:43:00 -0700 + Commit: f2c4708, github.com/apache/spark/pull/5604 + + [SPARK-6201] [SQL] promote string and do widen types for IN + Daoyuan Wang + 2015-05-06 10:30:42 -0700 + Commit: c3eb441, github.com/apache/spark/pull/4945 + + [SPARK-5456] [SQL] fix decimal compare for jdbc rdd + Daoyuan Wang + 2015-05-06 10:05:10 -0700 + Commit: 150f671, github.com/apache/spark/pull/5803 + + [SQL] JavaDoc update for various DataFrame functions. + Reynold Xin + 2015-05-06 08:50:56 -0700 + Commit: 322e7e7, github.com/apache/spark/pull/5935 + + [SPARK-6940] [MLLIB] Add CrossValidator to Python ML pipeline API + Xiangrui Meng + 2015-05-06 01:28:43 -0700 + Commit: 32cdc81, github.com/apache/spark/pull/5926 + + [SPARK-7384][Core][Tests] Fix flaky tests for distributed mode in BroadcastSuite + zsxwing + 2015-05-05 23:25:28 -0700 + Commit: 9f019c7, github.com/apache/spark/pull/5925 + + [SPARK-6267] [MLLIB] Python API for IsotonicRegression + Yanbo Liang , Xiangrui Meng + 2015-05-05 22:57:13 -0700 + Commit: 7b14578, github.com/apache/spark/pull/5890 + + [SPARK-7358][SQL] Move DataFrame mathfunctions into functions + Burak Yavuz + 2015-05-05 22:56:01 -0700 + Commit: ba2b566, github.com/apache/spark/pull/5923 + + [SPARK-6841] [SPARKR] add support for mean, median, stdev etc. + qhuang + 2015-05-05 20:39:56 -0700 + Commit: a466944, github.com/apache/spark/pull/5446 + + Revert "[SPARK-3454] separate json endpoints for data in the UI" + Reynold Xin + 2015-05-05 19:27:30 -0700 + Commit: 51b3d41 + + [SPARK-6231][SQL/DF] Automatically resolve join condition ambiguity for self-joins. + Reynold Xin + 2015-05-05 18:59:46 -0700 + Commit: 1fd31ba, github.com/apache/spark/pull/5919 + + Some minor cleanup after SPARK-4550. + Sandy Ryza + 2015-05-05 18:32:16 -0700 + Commit: 0092abb, github.com/apache/spark/pull/5916 + + [SPARK-7230] [SPARKR] Make RDD private in SparkR. 
+ Shivaram Venkataraman + 2015-05-05 14:40:33 -0700 + Commit: c688e3c, github.com/apache/spark/pull/5895 + + [SQL][Minor] make StringComparison extends ExpectsInputTypes + wangfei + 2015-05-05 14:24:37 -0700 + Commit: 3059291, github.com/apache/spark/pull/5905 + + [SPARK-7351] [STREAMING] [DOCS] Add spark.streaming.ui.retainedBatches to docs + zsxwing + 2015-05-05 13:42:23 -0700 + Commit: fec7b29, github.com/apache/spark/pull/5899 + + [SPARK-7294][SQL] ADD BETWEEN + 云峤 , kaka1992 + 2015-05-05 13:23:53 -0700 + Commit: 735bc3d, github.com/apache/spark/pull/5839 + + [SPARK-6939] [STREAMING] [WEBUI] Add timeline and histogram graphs for streaming statistics + zsxwing + 2015-05-05 12:52:16 -0700 + Commit: 489700c, github.com/apache/spark/pull/5533 + + [SPARK-5888] [MLLIB] Add OneHotEncoder as a Transformer + Sandy Ryza + 2015-05-05 12:34:02 -0700 + Commit: 47728db, github.com/apache/spark/pull/5500 + + [SPARK-7333] [MLLIB] Add BinaryClassificationEvaluator to PySpark + Xiangrui Meng + 2015-05-05 11:45:37 -0700 + Commit: ee374e8, github.com/apache/spark/pull/5885 + + [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames + Burak Yavuz + 2015-05-05 11:01:25 -0700 + Commit: 18340d7, github.com/apache/spark/pull/5900 + + [SPARK-7007] [CORE] Add a metric source for ExecutorAllocationManager + jerryshao + 2015-05-05 09:43:49 -0700 + Commit: 9f1f9b1, github.com/apache/spark/pull/5589 + + [SPARK-7318] [STREAMING] DStream cleans objects that are not closures + Andrew Or + 2015-05-05 09:37:49 -0700 + Commit: 57e9f29, github.com/apache/spark/pull/5860 + + [SPARK-7237] Many user provided closures are not actually cleaned + Andrew Or + 2015-05-05 09:37:04 -0700 + Commit: 1fdabf8, github.com/apache/spark/pull/5787 + + [MLLIB] [TREE] Verify size of input rdd > 0 when building meta data + Alain , aihe@usc.edu + 2015-05-05 16:47:34 +0100 + Commit: d4cb38a, github.com/apache/spark/pull/5810 + + Closes #5591 Closes #5878 + Xiangrui Meng + 2015-05-05 08:00:31 -0700 + Commit: 9d250e6 + + [SPARK-6612] [MLLIB] [PYSPARK] Python KMeans parity + Hrishikesh Subramonian + 2015-05-05 07:57:39 -0700 + Commit: 5995ada, github.com/apache/spark/pull/5647 + + [SPARK-7202] [MLLIB] [PYSPARK] Add SparseMatrixPickler to SerDe + MechCoder + 2015-05-05 07:53:11 -0700 + Commit: 5ab652c, github.com/apache/spark/pull/5775 + + [SPARK-7350] [STREAMING] [WEBUI] Attach the Streaming tab when calling ssc.start() + zsxwing + 2015-05-05 15:09:58 +0100 + Commit: c6d1efb, github.com/apache/spark/pull/5898 + + [SPARK-5074] [CORE] [TESTS] Fix the flakey test 'run shuffle with map stage failure' in DAGSchedulerSuite + zsxwing + 2015-05-05 15:04:14 +0100 + Commit: 5ffc73e, github.com/apache/spark/pull/5903 + + [MINOR] Minor update for document + Liang-Chi Hsieh + 2015-05-05 14:44:02 +0100 + Commit: b83091a, github.com/apache/spark/pull/5906 + + [SPARK-3454] separate json endpoints for data in the UI + Imran Rashid + 2015-05-05 07:25:40 -0500 + Commit: d497358, github.com/apache/spark/pull/4435 + + [SPARK-7357] Improving HBaseTest example + Jihong MA + 2015-05-05 12:40:41 +0100 + Commit: 51f4620, github.com/apache/spark/pull/5904 + + [SPARK-5112] Expose SizeEstimator as a developer api + Sandy Ryza + 2015-05-05 12:38:46 +0100 + Commit: 4222da6, 
github.com/apache/spark/pull/3913 + + [SPARK-6653] [YARN] New config to specify port for sparkYarnAM actor system + shekhar.bansal + 2015-05-05 11:09:51 +0100 + Commit: fc8feaa, github.com/apache/spark/pull/5719 + + [SPARK-7341] [STREAMING] [TESTS] Fix the flaky test: org.apache.spark.stre... + zsxwing + 2015-05-05 02:15:39 -0700 + Commit: 4d29867, github.com/apache/spark/pull/5891 + + [SPARK-7113] [STREAMING] Support input information reporting for Direct Kafka stream + jerryshao + 2015-05-05 02:01:06 -0700 + Commit: 8436f7e, github.com/apache/spark/pull/5879 + + [HOTFIX] [TEST] Ignoring flaky tests + Tathagata Das + 2015-05-05 01:58:51 -0700 + Commit: 8776fe0, github.com/apache/spark/pull/5901 + + [SPARK-7139] [STREAMING] Allow received block metadata to be saved to WAL and recovered on driver failure + Tathagata Das + 2015-05-05 01:45:19 -0700 + Commit: 1854ac3, github.com/apache/spark/pull/5732 + + [MINOR] [BUILD] Declare ivy dependency in root pom. + Marcelo Vanzin + 2015-05-05 08:56:16 +0100 + Commit: c5790a2, github.com/apache/spark/pull/5893 + + [MINOR] Renamed variables in SparkKMeans.scala, LocalKMeans.scala and kmeans.py to simplify readability + Niccolo Becchi , pippobaudos + 2015-05-05 08:54:42 +0100 + Commit: da738cf, github.com/apache/spark/pull/5875 + + [SPARK-7314] [SPARK-3524] [PYSPARK] upgrade Pyrolite to 4.4 + Xiangrui Meng + 2015-05-04 23:52:42 -0700 + Commit: e9b16e6, github.com/apache/spark/pull/5850 + + [SPARK-7236] [CORE] Fix to prevent AkkaUtils askWithReply from sleeping on final attempt + Bryan Cutler + 2015-05-04 18:29:22 -0700 + Commit: 8aa5aea, github.com/apache/spark/pull/5896 + + [SPARK-7266] Add ExpectsInputTypes to expressions when possible. + Reynold Xin + 2015-05-04 18:03:07 -0700 + Commit: 678c4da, github.com/apache/spark/pull/5796 + + [SPARK-7243][SQL] Contingency Tables for DataFrames + Burak Yavuz + 2015-05-04 17:02:49 -0700 + Commit: 8055411, github.com/apache/spark/pull/5842 + + [SPARK-6943] [SPARK-6944] DAG visualization on SparkUI + Andrew Or + 2015-05-04 16:21:36 -0700 + Commit: fc8b581, github.com/apache/spark/pull/5729 + + [SPARK-7319][SQL] Improve the output from DataFrame.show() + 云峤 + 2015-05-04 12:08:38 -0700 + Commit: f32e69e, github.com/apache/spark/pull/5865 + + [SPARK-5956] [MLLIB] Pipeline components should be copyable. + Xiangrui Meng + 2015-05-04 11:28:59 -0700 + Commit: e0833c5, github.com/apache/spark/pull/5820 + + [MINOR] Fix python test typo? + Andrew Or + 2015-05-04 17:17:55 +0100 + Commit: 5a1a107, github.com/apache/spark/pull/5883 + + +Release 1.4.0 + + [HOTFIX] Revert "[SPARK-7092] Update spark scala version to 2.11.6" + Patrick Wendell + 2015-05-19 02:28:41 -0700 + Commit: 31f5d53 + + Revert "Preparing Spark release v1.4.0-rc1" + Patrick Wendell + 2015-05-19 02:27:14 -0700 + Commit: 586ede6 + + Revert "Preparing development version 1.4.1-SNAPSHOT" + Patrick Wendell + 2015-05-19 02:27:07 -0700 + Commit: e7309ec + + Fixing a few basic typos in the Programming Guide. 
+ Mike Dusenberry + 2015-05-19 08:59:45 +0100 + Commit: 0748263, github.com/apache/spark/pull/6240 + + Preparing development version 1.4.1-SNAPSHOT + Patrick Wendell + 2015-05-19 07:13:24 +0000 + Commit: a1d896b + + Preparing Spark release v1.4.0-rc1 + Patrick Wendell + 2015-05-19 07:13:24 +0000 + Commit: 79fb01a + + Updating CHANGES.txt for Spark 1.4 + Patrick Wendell + 2015-05-19 00:12:20 -0700 + Commit: 30bf333 + + Revert "Preparing Spark release v1.4.0-rc1" + Patrick Wendell + 2015-05-19 00:10:39 -0700 + Commit: b0c63d2 + + Revert "Preparing development version 1.4.1-SNAPSHOT" + Patrick Wendell + 2015-05-19 00:10:37 -0700 + Commit: 198a186 + + [SPARK-7581] [ML] [DOC] User guide for spark.ml PolynomialExpansion + Xusen Yin + 2015-05-19 00:06:33 -0700 + Commit: 38a3fc8, github.com/apache/spark/pull/6113 + + [HOTFIX] Fixing style failures in Kinesis source + Patrick Wendell + 2015-05-19 00:02:06 -0700 + Commit: de60c2e + + Preparing development version 1.4.1-SNAPSHOT + Patrick Wendell + 2015-05-19 06:06:41 +0000 + Commit: 40190ce + + Preparing Spark release v1.4.0-rc1 + Patrick Wendell + 2015-05-19 06:06:40 +0000 + Commit: 38ccef3 + + Revert "Preparing Spark release v1.4.0-rc1" + Patrick Wendell + 2015-05-18 23:06:15 -0700 + Commit: 152b029 + + Revert "Preparing development version 1.4.1-SNAPSHOT" + Patrick Wendell + 2015-05-18 23:06:13 -0700 + Commit: 4d098bc + + [HOTFIX]: Java 6 Build Breaks + Patrick Wendell + 2015-05-19 06:00:13 +0000 + Commit: be1fc93 + + Preparing development version 1.4.1-SNAPSHOT + Patrick Wendell + 2015-05-19 05:01:11 +0000 + Commit: 758ca74 + + Preparing Spark release v1.4.0-rc1 + Patrick Wendell + 2015-05-19 05:01:11 +0000 + Commit: e8e97e3 + + [SPARK-7687] [SQL] DataFrame.describe() should cast all aggregates to String + Josh Rosen + 2015-05-18 21:53:44 -0700 + Commit: 99436bd, github.com/apache/spark/pull/6218 + + CHANGES.txt and changelist updates for Spark 1.4. 
+ Patrick Wendell + 2015-05-18 21:44:13 -0700 + Commit: 914ecd0 + + [SPARK-7150] SparkContext.range() and SQLContext.range() + Daoyuan Wang , Davies Liu + 2015-05-18 21:43:12 -0700 + Commit: 7fcbb2c, github.com/apache/spark/pull/6081 + + Version updates for Spark 1.4.0 + Patrick Wendell + 2015-05-18 21:38:37 -0700 + Commit: 9d0b7fb + + [SPARK-7681] [MLLIB] Add SparseVector support for gemv + Liang-Chi Hsieh + 2015-05-18 21:32:36 -0700 + Commit: dd9f873, github.com/apache/spark/pull/6209 + + [SPARK-7692] Updated Kinesis examples + Tathagata Das + 2015-05-18 18:24:15 -0700 + Commit: 9c48548, github.com/apache/spark/pull/6249 + + [SPARK-7621] [STREAMING] Report Kafka errors to StreamingListeners + jerluc + 2015-05-18 18:13:29 -0700 + Commit: 9188ad8, github.com/apache/spark/pull/6204 + + [SPARK-7624] Revert #4147 + Davies Liu + 2015-05-18 16:55:45 -0700 + Commit: 60cb33d, github.com/apache/spark/pull/6172 + + [SQL] Fix serializability of ORC table scan + Michael Armbrust + 2015-05-18 15:24:31 -0700 + Commit: f8f23c4, github.com/apache/spark/pull/6247 + + [SPARK-7501] [STREAMING] DAG visualization: show DStream operations + Andrew Or + 2015-05-18 14:33:33 -0700 + Commit: a475cbc, github.com/apache/spark/pull/6034 + + [HOTFIX] Fix ORC build break + Michael Armbrust + 2015-05-18 14:04:04 -0700 + Commit: ba502ab, github.com/apache/spark/pull/6244 + + [SPARK-7658] [STREAMING] [WEBUI] Update the mouse behaviors for the timeline graphs + zsxwing + 2015-05-18 13:34:43 -0700 + Commit: 39add3d, github.com/apache/spark/pull/6168 + + [SPARK-6216] [PYSPARK] check python version of worker with driver + Davies Liu + 2015-05-18 12:55:13 -0700 + Commit: a833209, github.com/apache/spark/pull/6203 + + [SPARK-7673] [SQL] WIP: HadoopFsRelation and ParquetRelation2 performance optimizations + Cheng Lian + 2015-05-18 12:45:37 -0700 + Commit: 3962348, github.com/apache/spark/pull/6225 + + [SPARK-7567] [SQL] [follow-up] Use a new flag to set output committer based on mapreduce apis + Yin Huai + 2015-05-18 12:17:10 -0700 + Commit: a385f4b, github.com/apache/spark/pull/6130 + + [SPARK-7269] [SQL] Incorrect analysis for aggregation(use semanticEquals) + Wenchen Fan + 2015-05-18 12:08:28 -0700 + Commit: d6f5f37, github.com/apache/spark/pull/6173 + + [SPARK-7631] [SQL] treenode argString should not print children + scwf + 2015-05-18 12:05:14 -0700 + Commit: dbd4ec8, github.com/apache/spark/pull/6144 + + [SPARK-2883] [SQL] ORC data source for Spark SQL + Zhan Zhang , Cheng Lian + 2015-05-18 12:03:27 -0700 + Commit: 65d71bd, github.com/apache/spark/pull/6194 + + [SPARK-7380] [MLLIB] pipeline stages should be copyable in Python + Xiangrui Meng , Joseph K. 
Bradley + 2015-05-18 12:02:18 -0700 + Commit: cf4e04a, github.com/apache/spark/pull/6088 + + [SQL] [MINOR] [THIS] use private for internal field in ScalaUdf + Wenchen Fan + 2015-05-18 12:01:30 -0700 + Commit: 7d44c01, github.com/apache/spark/pull/6235 + + [SPARK-7570] [SQL] Ignores _temporary during partition discovery + Cheng Lian + 2015-05-18 11:59:44 -0700 + Commit: c7623a2, github.com/apache/spark/pull/6091 + + [SPARK-6888] [SQL] Make the jdbc driver handling user-definable + Rene Treffer + 2015-05-18 11:55:36 -0700 + Commit: b41301a, github.com/apache/spark/pull/5555 + + [SPARK-7627] [SPARK-7472] DAG visualization: style skipped stages + Andrew Or + 2015-05-18 10:59:35 -0700 + Commit: a0ae8ce, github.com/apache/spark/pull/6171 + + [SPARK-7272] [MLLIB] User guide for PMML model export + Vincenzo Selvaggio + 2015-05-18 08:46:33 -0700 + Commit: a95d4e1, github.com/apache/spark/pull/6219 + + [SPARK-6657] [PYSPARK] Fix doc warnings + Xiangrui Meng + 2015-05-18 08:35:14 -0700 + Commit: 2c94ffe, github.com/apache/spark/pull/6221 + + [SPARK-7299][SQL] Set precision and scale for Decimal according to JDBC metadata instead of returned BigDecimal + Liang-Chi Hsieh + 2015-05-18 01:10:55 -0700 + Commit: 0e7cd8f, github.com/apache/spark/pull/5833 + + [SPARK-7694] [MLLIB] Use getOrElse for getting the threshold of LR model + Shuo Xiang + 2015-05-17 21:16:52 -0700 + Commit: 0b6bc8a, github.com/apache/spark/pull/6224 + + [SPARK-7693][Core] Remove "import scala.concurrent.ExecutionContext.Implicits.global" + zsxwing + 2015-05-17 20:37:19 -0700 + Commit: 2a42d2d, github.com/apache/spark/pull/6223 + + [SQL] [MINOR] use catalyst type converter in ScalaUdf + Wenchen Fan + 2015-05-17 16:51:57 -0700 + Commit: be66d19, github.com/apache/spark/pull/6182 + + [SPARK-6514] [SPARK-5960] [SPARK-6656] [SPARK-7679] [STREAMING] [KINESIS] Updates to the Kinesis API + Tathagata Das + 2015-05-17 16:49:07 -0700 + Commit: e0632ff, github.com/apache/spark/pull/6147 + + [SPARK-7491] [SQL] Allow configuration of classloader isolation for hive + Michael Armbrust + 2015-05-17 12:43:15 -0700 + Commit: a855608, github.com/apache/spark/pull/6167 + + [SPARK-7686] [SQL] DescribeCommand is assigned wrong output attributes in SparkStrategies + Josh Rosen + 2015-05-17 11:59:28 -0700 + Commit: 53d6ab5, github.com/apache/spark/pull/6217 + + [SPARK-7660] Wrap SnappyOutputStream to work around snappy-java bug + Josh Rosen + 2015-05-17 09:30:49 -0700 + Commit: 6df71eb, github.com/apache/spark/pull/6176 + + [SPARK-7669] Builds against Hadoop 2.6+ get inconsistent curator depend… + Steve Loughran + 2015-05-17 17:03:11 +0100 + Commit: 0feb3de, github.com/apache/spark/pull/6191 + + [SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized + Liang-Chi Hsieh + 2015-05-17 15:42:21 +0800 + Commit: 898be62, github.com/apache/spark/pull/6012 + + [MINOR] Add 1.3, 1.3.1 to master branch EC2 scripts + Shivaram Venkataraman + 2015-05-17 00:12:20 -0700 + Commit: 0ed376a, github.com/apache/spark/pull/6215 + + [MINOR] [SQL] Removes an unreachable case clause + Cheng Lian + 2015-05-16 23:20:09 -0700 + Commit: 671a6bc, github.com/apache/spark/pull/6214 + + [SPARK-7654][SQL] Move JDBC into DataFrame's reader/writer interface. 
+ Reynold Xin + 2015-05-16 22:01:53 -0700 + Commit: 17e0786, github.com/apache/spark/pull/6210 + + [SPARK-7655][Core] Deserializing value should not hold the TaskSchedulerImpl lock + zsxwing + 2015-05-16 21:03:22 -0700 + Commit: 8494910, github.com/apache/spark/pull/6195 + + [SPARK-7654][MLlib] Migrate MLlib to the DataFrame reader/writer API. + Reynold Xin + 2015-05-16 15:03:57 -0700 + Commit: bd057f8, github.com/apache/spark/pull/6211 + + [BUILD] update jblas dependency version to 1.2.4 + Matthew Brandyberry + 2015-05-16 18:17:48 +0100 + Commit: 8bde352, github.com/apache/spark/pull/6199 + + [HOTFIX] [SQL] Fixes DataFrameWriter.mode(String) + Cheng Lian + 2015-05-16 20:55:10 +0800 + Commit: 856619d, github.com/apache/spark/pull/6212 + + [SPARK-7655][Core][SQL] Remove 'scala.concurrent.ExecutionContext.Implicits.global' in 'ask' and 'BroadcastHashJoin' + zsxwing + 2015-05-16 00:44:29 -0700 + Commit: ad5b0b1, github.com/apache/spark/pull/6200 + + [SPARK-7672] [CORE] Use int conversion in translating kryoserializer.buffer.mb to kryoserializer.buffer + Nishkam Ravi , nishkamravi2 , nravi + 2015-05-16 08:24:21 +0100 + Commit: e7607e5, github.com/apache/spark/pull/6198 + + [SPARK-4556] [BUILD] binary distribution assembly can't run in local mode + Sean Owen + 2015-05-16 08:18:41 +0100 + Commit: 1fc3560, github.com/apache/spark/pull/6186 + + [SPARK-7671] Fix wrong URLs in MLlib Data Types Documentation + FavioVazquez + 2015-05-16 08:07:03 +0100 + Commit: 7e3f9fe, github.com/apache/spark/pull/6196 + + [SPARK-7654][SQL] DataFrameReader and DataFrameWriter for input/output API + Reynold Xin + 2015-05-15 22:00:31 -0700 + Commit: 9da55b5, github.com/apache/spark/pull/6175 + + [SPARK-7473] [MLLIB] Add reservoir sample in RandomForest + AiHe + 2015-05-15 20:42:35 -0700 + Commit: f41be8f, github.com/apache/spark/pull/5988 + + [SPARK-7543] [SQL] [PySpark] split dataframe.py into multiple files + Davies Liu + 2015-05-15 20:09:15 -0700 + Commit: 8164fbc, github.com/apache/spark/pull/6201 + + [SPARK-7073] [SQL] [PySpark] Clean up SQL data type hierarchy in Python + Davies Liu + 2015-05-15 20:05:26 -0700 + Commit: 61806f6, github.com/apache/spark/pull/6206 + + [SPARK-7575] [ML] [DOC] Example code for OneVsRest + Ram Sriharsha + 2015-05-15 19:33:20 -0700 + Commit: 04323ba, github.com/apache/spark/pull/6115 + + [SPARK-7563] OutputCommitCoordinator.stop() should only run on the driver + Josh Rosen + 2015-05-15 18:06:01 -0700 + Commit: ed75cc0, github.com/apache/spark/pull/6197 + + [SPARK-7676] Bug fix and cleanup of stage timeline view + Kay Ousterhout + 2015-05-15 17:45:14 -0700 + Commit: 6f78d03, github.com/apache/spark/pull/6202 + + [SPARK-7556] [ML] [DOC] Add user guide for spark.ml Binarizer, including Scala, Java and Python examples + Liang-Chi Hsieh + 2015-05-15 15:05:04 -0700 + Commit: e847d86, github.com/apache/spark/pull/6116 + + [SPARK-7677] [STREAMING] Add Kafka modules to the 2.11 build. + Iulian Dragos + 2015-05-15 14:57:29 -0700 + Commit: 31e6404, github.com/apache/spark/pull/6149 + + [SPARK-7226] [SPARKR] Support math functions in R DataFrame + qhuang + 2015-05-15 14:06:16 -0700 + Commit: 9ef6d74, github.com/apache/spark/pull/6170 + + [SPARK-7296] Add timeline visualization for stages in the UI. 
+ Kousuke Saruta + 2015-05-15 13:54:09 -0700 + Commit: a5f7b3b, github.com/apache/spark/pull/5843 + + [SPARK-7504] [YARN] NullPointerException when initializing SparkContext in YARN-cluster mode + ehnalis + 2015-05-15 12:14:02 -0700 + Commit: 7dc0ff3, github.com/apache/spark/pull/6083 + + [SPARK-7664] [WEBUI] DAG visualization: Fix incorrect link paths of DAG. + Kousuke Saruta + 2015-05-15 11:54:13 -0700 + Commit: e319719, github.com/apache/spark/pull/6184 + + [SPARK-5412] [DEPLOY] Cannot bind Master to a specific hostname as per the documentation + Sean Owen + 2015-05-15 11:30:19 -0700 + Commit: fe3c734, github.com/apache/spark/pull/6185 + + [CORE] Protect additional test vars from early GC + Tim Ellison + 2015-05-15 11:27:24 -0700 + Commit: 866e4b5, github.com/apache/spark/pull/6187 + + [SPARK-7233] [CORE] Detect REPL mode once + Oleksii Kostyliev , Oleksii Kostyliev + 2015-05-15 11:19:56 -0700 + Commit: c58b9c6, github.com/apache/spark/pull/5835 + + [SPARK-7651] [MLLIB] [PYSPARK] GMM predict, predictSoft should raise error on bad input + FlytxtRnD + 2015-05-15 10:43:18 -0700 + Commit: dfdae58, github.com/apache/spark/pull/6180 + + [SPARK-7668] [MLLIB] Preserve isTransposed property for Matrix after calling map function + Liang-Chi Hsieh + 2015-05-15 10:03:29 -0700 + Commit: d1f5651, github.com/apache/spark/pull/6188 + + [SPARK-7503] [YARN] Resources in .sparkStaging directory can't be cleaned up on error + Kousuke Saruta + 2015-05-15 11:37:34 +0100 + Commit: a17a0ee, github.com/apache/spark/pull/6026 + + [SPARK-7591] [SQL] Partitioning support API tweaks + Cheng Lian + 2015-05-15 16:20:49 +0800 + Commit: bcb2c5d, github.com/apache/spark/pull/6150 + + [SPARK-6258] [MLLIB] GaussianMixture Python API parity check + Yanbo Liang + 2015-05-15 00:18:39 -0700 + Commit: c0bb974, github.com/apache/spark/pull/6087 + + [SPARK-7650] [STREAMING] [WEBUI] Move streaming css and js files to the streaming project + zsxwing + 2015-05-14 23:51:41 -0700 + Commit: 0ba99f0, github.com/apache/spark/pull/6160 + + [CORE] Remove unreachable Heartbeat message from Worker + Kan Zhang + 2015-05-14 23:50:50 -0700 + Commit: 6742b4e, github.com/apache/spark/pull/6163 + + [HOTFIX] Add workaround for SPARK-7660 to fix JavaAPISuite failures. + Josh Rosen + 2015-05-14 23:17:41 -0700 + Commit: 1206a55 + + [SQL] When creating partitioned table scan, explicitly create UnionRDD. 
+ Yin Huai + 2015-05-15 12:04:26 +0800 + Commit: 7aa269f, github.com/apache/spark/pull/6162 + + [SPARK-7098][SQL] Make the WHERE clause with timestamp show consistent result + Liang-Chi Hsieh + 2015-05-14 20:49:21 -0700 + Commit: bac4522, github.com/apache/spark/pull/5682 + + [SPARK-7548] [SQL] Add explode function for DataFrames + Michael Armbrust + 2015-05-14 19:49:44 -0700 + Commit: 778a054, github.com/apache/spark/pull/6107 + + [SPARK-7619] [PYTHON] fix docstring signature + Xiangrui Meng + 2015-05-14 18:16:22 -0700 + Commit: a238c23, github.com/apache/spark/pull/6161 + + [SPARK-7648] [MLLIB] Add weights and intercept to GLM wrappers in spark.ml + Xiangrui Meng + 2015-05-14 18:13:58 -0700 + Commit: f91bb57, github.com/apache/spark/pull/6156 + + [SPARK-7645] [STREAMING] [WEBUI] Show milliseconds in the UI if the batch interval < 1 second + zsxwing + 2015-05-14 16:58:36 -0700 + Commit: 79983f1, github.com/apache/spark/pull/6154 + + [SPARK-7649] [STREAMING] [WEBUI] Use window.localStorage to store the status rather than the url + zsxwing + 2015-05-14 16:57:33 -0700 + Commit: 3358485, github.com/apache/spark/pull/6158 + + [SPARK-7643] [UI] use the correct size in RDDPage for storage info and partitions + Xiangrui Meng + 2015-05-14 16:56:32 -0700 + Commit: 8d8876d, github.com/apache/spark/pull/6157 + + [SPARK-7598] [DEPLOY] Add aliveWorkers metrics in Master + Rex Xiong + 2015-05-14 16:55:31 -0700 + Commit: 894214f, github.com/apache/spark/pull/6117 + + Make SPARK prefix a variable + tedyu + 2015-05-14 15:26:35 -0700 + Commit: fceaffc, github.com/apache/spark/pull/6153 + + [SPARK-7278] [PySpark] DateType should find datetime.datetime acceptable + ksonj + 2015-05-14 15:10:58 -0700 + Commit: a49a145, github.com/apache/spark/pull/6057 + + [SQL][minor] rename apply for QueryPlanner + Wenchen Fan + 2015-05-14 10:25:18 -0700 + Commit: aa8a0f9, github.com/apache/spark/pull/6142 + + [SPARK-7249] Updated Hadoop dependencies due to inconsistency in the versions + FavioVazquez + 2015-05-14 15:22:58 +0100 + Commit: 67ed0aa, github.com/apache/spark/pull/5786 + + [SPARK-7568] [ML] ml.LogisticRegression doesn't output the right prediction + DB Tsai + 2015-05-14 01:26:08 -0700 + Commit: 58534b0, github.com/apache/spark/pull/6109 + + [SPARK-7407] [MLLIB] use uid + name to identify parameters + Xiangrui Meng + 2015-05-14 01:22:15 -0700 + Commit: e45cd9f, github.com/apache/spark/pull/6019 + + [SPARK-7595] [SQL] Window will cause resolve failed with self join + linweizhong + 2015-05-14 00:23:27 -0700 + Commit: c80e0cf, github.com/apache/spark/pull/6114 + + [SPARK-7620] [ML] [MLLIB] Removed calling size, length in while condition to avoid extra JVM call + DB Tsai + 2015-05-13 22:23:21 -0700 + Commit: 9ab4db2, github.com/apache/spark/pull/6137 + + [SPARK-7612] [MLLIB] update NB training to use mllib's BLAS + Xiangrui Meng + 2015-05-13 21:27:17 -0700 + Commit: 82f387f, github.com/apache/spark/pull/6128 + + [HOT FIX #6125] Do not wait for all stages to start rendering + Andrew Or + 2015-05-13 21:04:13 -0700 + Commit: 2d4a961, github.com/apache/spark/pull/6138 + + [HOTFIX] Use 'new Job' in fsBasedParquet.scala + zsxwing + 2015-05-13 17:58:29 -0700 + Commit: d518c03, github.com/apache/spark/pull/6136 + + [SPARK-6752] [STREAMING] [REVISED] 
Allow StreamingContext to be recreated from checkpoint and existing SparkContext + Tathagata Das + 2015-05-13 17:33:15 -0700 + Commit: aec8394, github.com/apache/spark/pull/6096 + + [SPARK-7601] [SQL] Support Insert into JDBC Datasource + Venkata Ramana Gollamudi + 2015-05-13 17:24:04 -0700 + Commit: 820aaa6, github.com/apache/spark/pull/6121 + + [SPARK-7081] Faster sort-based shuffle path using binary processing cache-aware sort + Josh Rosen + 2015-05-13 17:07:31 -0700 + Commit: c53ebea, github.com/apache/spark/pull/5868 + + [SPARK-7356] [STREAMING] Fix flakey tests in FlumePollingStreamSuite using SparkSink's batch CountDownLatch. + Hari Shreedharan + 2015-05-13 16:43:30 -0700 + Commit: 6c0644a, github.com/apache/spark/pull/5918 + + [STREAMING] [MINOR] Keep streaming.UIUtils private + Andrew Or + 2015-05-13 16:31:24 -0700 + Commit: e499a1e, github.com/apache/spark/pull/6134 + + [SPARK-7502] DAG visualization: gracefully handle removed stages + Andrew Or + 2015-05-13 16:29:52 -0700 + Commit: 895d46a, github.com/apache/spark/pull/6132 + + [SPARK-7464] DAG visualization: highlight the same RDDs on hover + Andrew Or + 2015-05-13 16:29:10 -0700 + Commit: 4b4f10b, github.com/apache/spark/pull/6100 + + [SPARK-7399] Spark compilation error for scala 2.11 + Andrew Or + 2015-05-13 16:28:37 -0700 + Commit: e6b8cef, github.com/apache/spark/pull/6129 + + [SPARK-7608] Clean up old state in RDDOperationGraphListener + Andrew Or + 2015-05-13 16:27:48 -0700 + Commit: ec34230, github.com/apache/spark/pull/6125 + + [SQL] Move some classes into packages that are more appropriate. + Reynold Xin + 2015-05-13 16:15:31 -0700 + Commit: acd872b, github.com/apache/spark/pull/6108 + + [SPARK-7303] [SQL] push down project if possible when the child is sort + scwf + 2015-05-13 16:13:48 -0700 + Commit: d5c52d9, github.com/apache/spark/pull/5838 + + [SPARK-7382] [MLLIB] Feature Parity in PySpark for ml.classification + Burak Yavuz + 2015-05-13 15:13:09 -0700 + Commit: 51230f2, github.com/apache/spark/pull/6106 + + [SPARK-7545] [MLLIB] Added check in Bernoulli Naive Bayes to make sure that both training and predict features have values of 0 or 1 + leahmcguire + 2015-05-13 14:13:19 -0700 + Commit: d9fb905, github.com/apache/spark/pull/6073 + + [SPARK-7593] [ML] Python Api for ml.feature.Bucketizer + Burak Yavuz + 2015-05-13 13:21:36 -0700 + Commit: 11911b0, github.com/apache/spark/pull/6124 + + [SPARK-7551][DataFrame] support backticks for DataFrame attribute resolution + Wenchen Fan + 2015-05-13 12:47:48 -0700 + Commit: 3a60bcb, github.com/apache/spark/pull/6074 + + [SPARK-7567] [SQL] Migrating Parquet data source to FSBasedRelation + Cheng Lian + 2015-05-13 11:04:10 -0700 + Commit: 90f304b, github.com/apache/spark/pull/6090 + + [SPARK-7589] [STREAMING] [WEBUI] Make "Input Rate" in the Streaming page consistent with other pages + zsxwing + 2015-05-13 10:01:26 -0700 + Commit: 10007fb, github.com/apache/spark/pull/6102 + + [SPARK-6734] [SQL] Add UDTF.close support in Generate + Cheng Hao + 2015-05-14 00:14:59 +0800 + Commit: 42cf4a2, github.com/apache/spark/pull/5383 + + [MINOR] [SQL] Removes debugging println + Cheng Lian + 2015-05-13 23:40:13 +0800 + Commit: d78f0e1, github.com/apache/spark/pull/6123 + + [SQL] In InsertIntoFSBasedRelation.insert, log cause 
before abort job/task. + Yin Huai + 2015-05-13 23:36:19 +0800 + Commit: 9ca28d9, github.com/apache/spark/pull/6105 + + [SPARK-7599] [SQL] Don't restrict customized output committers to be subclasses of FileOutputCommitter + Cheng Lian + 2015-05-13 07:35:55 -0700 + Commit: cb1fe81, github.com/apache/spark/pull/6118 + + [SPARK-6568] spark-shell.cmd --jars option does not accept the jar that has space in its path + Masayoshi TSUZUKI , Kousuke Saruta + 2015-05-13 09:43:40 +0100 + Commit: bfdecac, github.com/apache/spark/pull/5447 + + [SPARK-7526] [SPARKR] Specify ip of RBackend, MonitorServer and RRDD Socket server + linweizhong + 2015-05-12 23:55:44 -0700 + Commit: 7bd5274, github.com/apache/spark/pull/6053 + + [SPARK-7482] [SPARKR] Rename some DataFrame API methods in SparkR to match their counterparts in Scala. + Sun Rui + 2015-05-12 23:52:30 -0700 + Commit: b18f1c6, github.com/apache/spark/pull/6007 + + [SPARK-7566][SQL] Add type to HiveContext.analyzer + Santiago M. Mola + 2015-05-12 23:44:21 -0700 + Commit: 6ff3379, github.com/apache/spark/pull/6086 + + [SPARK-7321][SQL] Add Column expression for conditional statements (when/otherwise) + Reynold Xin , kaka1992 + 2015-05-12 21:43:34 -0700 + Commit: 219a904, github.com/apache/spark/pull/6072 + + [SPARK-7588] Document all SQL/DataFrame public methods with @since tag + Reynold Xin + 2015-05-12 18:37:02 -0700 + Commit: bdd5db9, github.com/apache/spark/pull/6101 + + [HOTFIX] Use the old Job API to support old Hadoop versions + zsxwing + 2015-05-13 08:33:24 +0800 + Commit: 2cc3301, github.com/apache/spark/pull/6095 + + [SPARK-7572] [MLLIB] do not import Param/Params under pyspark.ml + Xiangrui Meng + 2015-05-12 17:15:39 -0700 + Commit: 08ec1af, github.com/apache/spark/pull/6094 + + [SPARK-7554] [STREAMING] Throw exception when an active/stopped StreamingContext is used to create DStreams and output operations + Tathagata Das + 2015-05-12 17:07:21 -0700 + Commit: bb81b15, github.com/apache/spark/pull/6099 + + [SPARK-7528] [MLLIB] make RankingMetrics Java-friendly + Xiangrui Meng + 2015-05-12 16:53:47 -0700 + Commit: 6c292a2, github.com/apache/spark/pull/6098 + + [SPARK-7553] [STREAMING] Added methods to maintain a singleton StreamingContext + Tathagata Das + 2015-05-12 16:44:14 -0700 + Commit: 91fbd93, github.com/apache/spark/pull/6070 + + [SPARK-7573] [ML] OneVsRest cleanups + Joseph K. Bradley + 2015-05-12 16:42:30 -0700 + Commit: 612247f, github.com/apache/spark/pull/6097 + + [SPARK-7557] [ML] [DOC] User guide for spark.ml HashingTF, Tokenizer + Joseph K. 
Bradley + 2015-05-12 16:39:56 -0700 + Commit: d080df1, github.com/apache/spark/pull/6093 + + [SPARK-7496] [MLLIB] Update Programming guide with Online LDA + Yuhao Yang + 2015-05-12 15:12:29 -0700 + Commit: fe34a59, github.com/apache/spark/pull/6046 + + [SPARK-7406] [STREAMING] [WEBUI] Add tooltips for "Scheduling Delay", "Processing Time" and "Total Delay" + zsxwing + 2015-05-12 14:41:21 -0700 + Commit: 221375e, github.com/apache/spark/pull/5952 + + [SPARK-7571] [MLLIB] rename Math to math + Xiangrui Meng + 2015-05-12 14:39:03 -0700 + Commit: 2555517, github.com/apache/spark/pull/6092 + + [SPARK-7484][SQL]Support jdbc connection properties + Venkata Ramana Gollamudi + 2015-05-12 14:37:23 -0700 + Commit: 32819fc, github.com/apache/spark/pull/6009 + + [SPARK-7559] [MLLIB] Bucketizer should include the right most boundary in the last bucket. + Xiangrui Meng + 2015-05-12 14:24:26 -0700 + Commit: 98ccd93, github.com/apache/spark/pull/6075 + + [SPARK-7569][SQL] Better error for invalid binary expressions + Michael Armbrust + 2015-05-12 13:36:55 -0700 + Commit: c68485e, github.com/apache/spark/pull/6089 + + [SPARK-7015] [MLLIB] [WIP] Multiclass to Binary Reduction: One Against All + Ram Sriharsha + 2015-05-12 13:35:12 -0700 + Commit: fd16709, github.com/apache/spark/pull/5830 + + [SPARK-2018] [CORE] Upgrade LZF library to fix endian serialization p… + Tim Ellison + 2015-05-12 20:48:26 +0100 + Commit: eadda92, github.com/apache/spark/pull/6077 + + [SPARK-7487] [ML] Feature Parity in PySpark for ml.regression + Burak Yavuz + 2015-05-12 12:17:05 -0700 + Commit: 432694c, github.com/apache/spark/pull/6016 + + [HOT FIX #6076] DAG visualization: curve the edges + Andrew Or + 2015-05-12 12:06:30 -0700 + Commit: ce6c400 + + [SPARK-7276] [DATAFRAME] speed up DataFrame.select by collapsing Project + Wenchen Fan + 2015-05-12 11:51:55 -0700 + Commit: 8be43f8, github.com/apache/spark/pull/5831 + + [SPARK-7500] DAG visualization: move cluster labeling to dagre-d3 + Andrew Or + 2015-05-12 11:17:59 -0700 + Commit: a236104, github.com/apache/spark/pull/6076 + + [DataFrame][minor] support column in field accessor + Wenchen Fan + 2015-05-12 10:37:57 -0700 + Commit: ec89286, github.com/apache/spark/pull/6080 + + [SPARK-3928] [SPARK-5182] [SQL] Partitioning support for the data sources API + Cheng Lian + 2015-05-13 01:32:28 +0800 + Commit: d232813, github.com/apache/spark/pull/5526 + + [DataFrame][minor] cleanup unapply methods in DataTypes + Wenchen Fan + 2015-05-12 10:28:40 -0700 + Commit: a9d84a9, github.com/apache/spark/pull/6079 + + [SPARK-6876] [PySpark] [SQL] add DataFrame na.replace in pyspark + Daoyuan Wang + 2015-05-12 10:23:41 -0700 + Commit: 653db0a, github.com/apache/spark/pull/6003 + + [SPARK-7532] [STREAMING] StreamingContext.start() made to logWarning and not throw exception + Tathagata Das + 2015-05-12 08:48:24 -0700 + Commit: 2bbb685, github.com/apache/spark/pull/6060 + + [SPARK-7467] Dag visualization: treat checkpoint as an RDD operation + Andrew Or + 2015-05-12 01:40:55 -0700 + Commit: 5601632, github.com/apache/spark/pull/6004 + + [SPARK-7485] [BUILD] Remove pyspark files from assembly. 
+ Marcelo Vanzin + 2015-05-12 01:39:21 -0700 + Commit: afe54b7, github.com/apache/spark/pull/6022 + + [MINOR] [PYSPARK] Set PYTHONPATH to python/lib/pyspark.zip rather than python/pyspark + linweizhong + 2015-05-12 01:36:27 -0700 + Commit: 4092a2e, github.com/apache/spark/pull/6047 + + [SPARK-7534] [CORE] [WEBUI] Fix the Stage table when a stage is missing + zsxwing + 2015-05-12 01:34:33 -0700 + Commit: af374ed, github.com/apache/spark/pull/6061 + + [SPARK-6994][SQL] Update docs for fetching Row fields by name + vidmantas zemleris + 2015-05-11 22:29:24 -0700 + Commit: 6523fb8, github.com/apache/spark/pull/6030 + + [SQL] Rename Dialect -> ParserDialect. + Reynold Xin + 2015-05-11 22:06:56 -0700 + Commit: c6b8148, github.com/apache/spark/pull/6071 + + [SPARK-7435] [SPARKR] Make DataFrame.show() consistent with that of Scala and pySpark + Joshi , Rekha Joshi + 2015-05-11 21:02:34 -0700 + Commit: 835a770, github.com/apache/spark/pull/5989 + + [SPARK-7509][SQL] DataFrame.drop in Python for dropping columns. + Reynold Xin + 2015-05-11 20:04:36 -0700 + Commit: ed40ab5, github.com/apache/spark/pull/6068 + + [SPARK-7437] [SQL] Fold "literal in (item1, item2, ..., literal, ...)" into true or false directly + Zhongshuai Pei <799203320@qq.com>, DoingDone9 <799203320@qq.com> + 2015-05-11 19:22:44 -0700 + Commit: c30982d, github.com/apache/spark/pull/5972 + + [SPARK-7411] [SQL] Support SerDe for HiveQl in CTAS + Cheng Hao + 2015-05-11 19:21:16 -0700 + Commit: 1a664a0, github.com/apache/spark/pull/5963 + + [SPARK-7324] [SQL] DataFrame.dropDuplicates + Reynold Xin + 2015-05-11 19:15:14 -0700 + Commit: 8a9d234, github.com/apache/spark/pull/6066 + + [SPARK-7530] [STREAMING] Added StreamingContext.getState() to expose the current state of the context + Tathagata Das + 2015-05-11 18:53:50 -0700 + Commit: c16b47f, github.com/apache/spark/pull/6058 + + [SPARK-5893] [ML] Add bucketizer + Xusen Yin , Joseph K. Bradley + 2015-05-11 18:41:22 -0700 + Commit: f188815, github.com/apache/spark/pull/5980 + + Updated DataFrame.saveAsTable Hive warning to include SPARK-7550 ticket. + Reynold Xin + 2015-05-11 18:10:45 -0700 + Commit: e1e599d, github.com/apache/spark/pull/6067 + + [SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames. + Reynold Xin + 2015-05-11 18:07:12 -0700 + Commit: eaa6116, github.com/apache/spark/pull/6062 + + [SPARK-7084] improve saveAsTable documentation + madhukar + 2015-05-11 17:04:11 -0700 + Commit: 0dbfe16, github.com/apache/spark/pull/5654 + + [SQL] Show better error messages for incorrect join types in DataFrames. 
+ Reynold Xin + 2015-05-11 17:02:11 -0700 + Commit: 0ff34f80, github.com/apache/spark/pull/6064 + + Update Documentation: leftsemi instead of semijoin + LCY Vincent + 2015-05-11 14:48:10 -0700 + Commit: 788503a, github.com/apache/spark/pull/5944 + + [STREAMING] [MINOR] Close files correctly when iterator is finished in streaming WAL recovery + jerryshao + 2015-05-11 14:38:58 -0700 + Commit: 9e226e1, github.com/apache/spark/pull/6050 + + [SPARK-7516] [Minor] [DOC] Replace deprecated inferSchema() with createDataFrame() + gchen + 2015-05-11 14:37:18 -0700 + Commit: 1538b10, github.com/apache/spark/pull/6041 + + [SPARK-7508] JettyUtils-generated servlets to log & report all errors + Steve Loughran + 2015-05-11 13:35:06 -0700 + Commit: 779174a, github.com/apache/spark/pull/6033 + + [SPARK-7462] By default retain group by columns in aggregate + Reynold Xin , Shivaram Venkataraman + 2015-05-11 11:35:16 -0700 + Commit: 9c35f02, github.com/apache/spark/pull/5996 + + [SPARK-7361] [STREAMING] Throw unambiguous exception when attempting to start multiple StreamingContexts in the same JVM + Tathagata Das + 2015-05-11 10:58:56 -0700 + Commit: 11648fa, github.com/apache/spark/pull/5907 + + [SPARK-7522] [EXAMPLES] Removed angle brackets from dataFormat option + Bryan Cutler + 2015-05-11 09:23:47 -0700 + Commit: c234d78, github.com/apache/spark/pull/6049 + + [SPARK-6092] [MLLIB] Add RankingMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-11 09:14:20 -0700 + Commit: 017f9fa, github.com/apache/spark/pull/6044 + + [SPARK-7326] [STREAMING] Performing window() on a WindowedDStream doesn't work all the time + Wesley Miao , Wesley + 2015-05-11 12:20:06 +0100 + Commit: da1be15, github.com/apache/spark/pull/5871 + + [SPARK-7519] [SQL] fix minor bugs in thrift server UI + tianyi + 2015-05-11 14:08:15 +0800 + Commit: fff3c86, github.com/apache/spark/pull/6048 + + [SPARK-7512] [SPARKR] Fix RDD's show method to use getJRDD + Shivaram Venkataraman + 2015-05-10 19:49:42 -0700 + Commit: 5f227fd, github.com/apache/spark/pull/6035 + + [SPARK-7427] [PYSPARK] Make sharedParams match in Scala, Python + Glenn Weidner + 2015-05-10 19:18:32 -0700 + Commit: 051864e, github.com/apache/spark/pull/6023 + + [SPARK-5521] PCA wrapper for easy transform vectors + Kirill A. Korinskiy , Joseph K. Bradley + 2015-05-10 13:34:00 -0700 + Commit: 193ff69, github.com/apache/spark/pull/4304 + + [SPARK-7431] [ML] [PYTHON] Made CrossValidatorModel call parent init in PySpark + Joseph K. 
Bradley + 2015-05-10 13:29:27 -0700 + Commit: d49b72c, github.com/apache/spark/pull/5968 + + [MINOR] [SQL] Fixes variable name typo + Cheng Lian + 2015-05-10 21:26:36 +0800 + Commit: fd87b2a, github.com/apache/spark/pull/6038 + + [SPARK-7345][SQL] Spark cannot detect renamed columns using JDBC connector + Oleg Sidorkin + 2015-05-10 01:31:34 -0700 + Commit: 5c40403, github.com/apache/spark/pull/6032 + + [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-10 00:57:14 -0700 + Commit: fe46374, github.com/apache/spark/pull/6011 + + [SPARK-7475] [MLLIB] adjust ldaExample for online LDA + Yuhao Yang + 2015-05-09 15:40:46 -0700 + Commit: e96fc86, github.com/apache/spark/pull/6000 + + [BUILD] Reference fasterxml.jackson.version in sql/core/pom.xml + tedyu + 2015-05-09 13:19:07 -0700 + Commit: 5110f3e, github.com/apache/spark/pull/6031 + + Upgrade version of jackson-databind in sql/core/pom.xml + tedyu + 2015-05-09 10:41:30 -0700 + Commit: 6c5b9ff, github.com/apache/spark/pull/6028 + + [STREAMING] [DOCS] Fix wrong url about API docs of StreamingListener + dobashim + 2015-05-09 10:14:46 +0100 + Commit: 5dbc7bb, github.com/apache/spark/pull/6024 + + [SPARK-7403] [WEBUI] Link URL in objects on Timeline View is wrong in case of running on YARN + Kousuke Saruta + 2015-05-09 10:10:29 +0100 + Commit: 869a52d, github.com/apache/spark/pull/5947 + + [SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct + Vinod K C + 2015-05-09 10:03:15 +0100 + Commit: b0460f4, github.com/apache/spark/pull/5974 + + [SPARK-7498] [ML] removed varargs annotation from Params.setDefaults + Joseph K. Bradley + 2015-05-08 21:55:54 -0700 + Commit: 25972d3, github.com/apache/spark/pull/6021 + + [SPARK-7262] [ML] Binary LogisticRegression with L1/L2 (elastic net) using OWLQN in new ML package + DB Tsai + 2015-05-08 21:43:05 -0700 + Commit: 80bbe72, github.com/apache/spark/pull/5967 + + [SPARK-7375] [SQL] Avoid row copying in exchange when sort.serializeMapOutputs takes effect + Josh Rosen + 2015-05-08 22:09:55 -0400 + Commit: 21212a2, github.com/apache/spark/pull/5948 + + [SPARK-7231] [SPARKR] Changes to make SparkR DataFrame dplyr friendly. 
+ Shivaram Venkataraman + 2015-05-08 18:29:57 -0700 + Commit: 448ff33, github.com/apache/spark/pull/6005 + + [SPARK-7451] [YARN] Preemption of executors is counted as failure causing Spark job to fail + Ashwin Shankar + 2015-05-08 17:51:00 -0700 + Commit: 959c7b6, github.com/apache/spark/pull/5993 + + [SPARK-7488] [ML] Feature Parity in PySpark for ml.recommendation + Burak Yavuz + 2015-05-08 17:24:32 -0700 + Commit: 85cab34, github.com/apache/spark/pull/6015 + + [SPARK-7237] Clean function in several RDD methods + tedyu + 2015-05-08 17:16:38 -0700 + Commit: 45b6215, github.com/apache/spark/pull/5959 + + [SPARK-7469] [SQL] DAG visualization: show SQL query operators + Andrew Or + 2015-05-08 17:15:10 -0700 + Commit: cafffd0, github.com/apache/spark/pull/5999 + + [SPARK-6955] Perform port retries at NettyBlockTransferService level + Aaron Davidson + 2015-05-08 17:13:55 -0700 + Commit: 1eae476, github.com/apache/spark/pull/5575 + + updated ec2 instance types + Brendan Collins + 2015-05-08 15:59:34 -0700 + Commit: 6e35cb5, github.com/apache/spark/pull/6014 + + [SPARK-5913] [MLLIB] Python API for ChiSqSelector + Yanbo Liang + 2015-05-08 15:48:39 -0700 + Commit: ab48df3, github.com/apache/spark/pull/5939 + + [SPARK-4699] [SQL] Make caseSensitive configurable in spark sql analyzer + Jacky Li , wangfei , scwf + 2015-05-08 15:25:54 -0700 + Commit: 21bd722, github.com/apache/spark/pull/5806 + + [SPARK-7390] [SQL] Only merge other CovarianceCounter when its count is greater than zero + Liang-Chi Hsieh + 2015-05-08 14:41:16 -0700 + Commit: 5205eb4, github.com/apache/spark/pull/5931 + + [SPARK-7378] [CORE] Handle deep links to unloaded apps. + Marcelo Vanzin + 2015-05-08 14:12:58 -0700 + Commit: 3024f6b, github.com/apache/spark/pull/5922 + + [MINOR] [CORE] Allow History Server to read kerberos opts from config file. 
+ Marcelo Vanzin + 2015-05-08 14:10:27 -0700 + Commit: 3da5f8b, github.com/apache/spark/pull/5998 + + [SPARK-7466] DAG visualization: fix orphan nodes + Andrew Or + 2015-05-08 14:09:39 -0700 + Commit: ca2f1c5, github.com/apache/spark/pull/6002 + + [MINOR] Defeat early garbage collection of test suite variable + Tim Ellison + 2015-05-08 14:08:52 -0700 + Commit: f734c58, github.com/apache/spark/pull/6010 + + [SPARK-7489] [SPARK SHELL] Spark shell crashes when compiled with scala 2.11 + vinodkc + 2015-05-08 14:07:53 -0700 + Commit: 3b7fb7a, github.com/apache/spark/pull/6013 + + [WEBUI] Remove debug feature for vis.js + Kousuke Saruta + 2015-05-08 14:06:37 -0700 + Commit: 1dde3b3, github.com/apache/spark/pull/5994 + + [MINOR] Ignore python/lib/pyspark.zip + zsxwing + 2015-05-08 14:06:02 -0700 + Commit: ab0caa0, github.com/apache/spark/pull/6017 + + [SPARK-7490] [CORE] [Minor] MapOutputTracker.deserializeMapStatuses: close input streams + Evan Jones + 2015-05-08 22:00:39 +0100 + Commit: 6230809, github.com/apache/spark/pull/5982 + + [SPARK-6627] Finished rename to ShuffleBlockResolver + Kay Ousterhout + 2015-05-08 12:24:06 -0700 + Commit: 82be68f, github.com/apache/spark/pull/5764 + + [SPARK-7133] [SQL] Implement struct, array, and map field accessor + Wenchen Fan + 2015-05-08 11:49:38 -0700 + Commit: f8468c4, github.com/apache/spark/pull/5744 + + [SPARK-7298] Harmonize style of new visualizations + Matei Zaharia + 2015-05-08 14:41:42 -0400 + Commit: 0b2c252, github.com/apache/spark/pull/5942 + + [SPARK-7436] Fixed instantiation of custom recovery mode factory and added tests + Jacek Lewandowski + 2015-05-08 11:38:09 -0700 + Commit: 89d9487, github.com/apache/spark/pull/5976 + + [SPARK-6824] Fill the docs for DataFrame API in SparkR + hqzizania , qhuang + 2015-05-08 11:25:04 -0700 + Commit: 4f01f5b, github.com/apache/spark/pull/5969 + + [SPARK-7474] [MLLIB] update ParamGridBuilder doctest + Xiangrui Meng + 2015-05-08 11:16:04 -0700 + Commit: 75fed0c, github.com/apache/spark/pull/6001 + + [SPARK-7383] [ML] Feature Parity in PySpark for ml.features + Burak Yavuz + 2015-05-08 11:14:39 -0700 + Commit: 85e1154, github.com/apache/spark/pull/5991 + + [SPARK-3454] separate json endpoints for data in the UI + Imran Rashid + 2015-05-08 16:54:32 +0100 + Commit: 532bfda, github.com/apache/spark/pull/5940 + + [SPARK-6869] [PYSPARK] Add pyspark archives path to PYTHONPATH + Lianhui Wang + 2015-05-08 08:44:46 -0500 + Commit: acf4bc1, github.com/apache/spark/pull/5580 + + [SPARK-7392] [CORE] bugfix: Kryo buffer size cannot be larger than 2M + Zhang, Liye + 2015-05-08 09:10:58 +0100 + Commit: f5e9678, github.com/apache/spark/pull/5934 + + [SPARK-7232] [SQL] Add a Substitution batch for spark sql analyzer + wangfei + 2015-05-07 22:55:42 -0700 + Commit: bb5872f, github.com/apache/spark/pull/5776 + + [SPARK-7470] [SQL] Spark shell SQLContext crashes without hive + Andrew Or + 2015-05-07 22:32:13 -0700 + Commit: 1a3e9e9, github.com/apache/spark/pull/5997 + + [SPARK-6986] [SQL] Use Serializer2 in more cases. 
+ Yin Huai + 2015-05-07 20:59:42 -0700 + Commit: 9d0d289, github.com/apache/spark/pull/5849 + + [SPARK-7452] [MLLIB] fix bug in topBykey and update test + Shuo Xiang + 2015-05-07 20:55:08 -0700 + Commit: 28d4238, github.com/apache/spark/pull/5990 + + [SPARK-6908] [SQL] Use isolated Hive client + Michael Armbrust + 2015-05-07 19:36:24 -0700 + Commit: 05454fd, github.com/apache/spark/pull/5876 + + [SPARK-7305] [STREAMING] [WEBUI] Make BatchPage show friendly information when jobs are dropped by SparkListener + zsxwing + 2015-05-07 17:34:44 -0700 + Commit: 2e8a141, github.com/apache/spark/pull/5840 + + [SPARK-7450] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() + tedyu + 2015-05-07 16:53:59 -0700 + Commit: 99897fe, github.com/apache/spark/pull/5897 + + [SPARK-2155] [SQL] [WHEN D THEN E] [ELSE F] add CaseKeyWhen for "CASE a WHEN b THEN c * END" + Wenchen Fan + 2015-05-07 16:26:49 -0700 + Commit: 622a0c5, github.com/apache/spark/pull/5979 + + [SPARK-5281] [SQL] Registering table on RDD is giving MissingRequirementError + Iulian Dragos + 2015-05-07 16:24:11 -0700 + Commit: 9fd25f7, github.com/apache/spark/pull/5981 + + [SPARK-7277] [SQL] Throw exception if the property mapred.reduce.tasks is set to -1 + Liang-Chi Hsieh + 2015-05-07 16:22:45 -0700 + Commit: 7064ea0, github.com/apache/spark/pull/5811 + + [SQL] [MINOR] make star and multialias extend NamedExpression + scwf + 2015-05-07 16:21:24 -0700 + Commit: 2425e4d, github.com/apache/spark/pull/5928 + + [SPARK-6948] [MLLIB] compress vectors in VectorAssembler + Xiangrui Meng + 2015-05-07 15:45:37 -0700 + Commit: 475143a, github.com/apache/spark/pull/5985 + + [SPARK-5726] [MLLIB] Elementwise (Hadamard) Vector Product Transformer + Octavian Geagla , Joseph K. Bradley + 2015-05-07 14:49:55 -0700 + Commit: 76e58b5, github.com/apache/spark/pull/4580 + + [SPARK-7328] [MLLIB] [PYSPARK] Pyspark.mllib.linalg.Vectors: Missing items + MechCoder + 2015-05-07 14:02:05 -0700 + Commit: 4436e26, github.com/apache/spark/pull/5872 + + [SPARK-7347] DAG visualization: add tooltips to RDDs + Andrew Or + 2015-05-07 12:29:56 -0700 + Commit: 1b742a4, github.com/apache/spark/pull/5957 + + [SPARK-7391] DAG visualization: auto expand if linked from another viz + Andrew Or + 2015-05-07 12:29:18 -0700 + Commit: 800c0fc, github.com/apache/spark/pull/5958 + + [SPARK-7373] [MESOS] Add docker support for launching drivers in mesos cluster mode. + Timothy Chen + 2015-05-07 12:23:16 -0700 + Commit: 226033c, github.com/apache/spark/pull/5917 + + [SPARK-7399] [SPARK CORE] Fixed compilation error in scala 2.11 + Tijo Thomas + 2015-05-07 12:21:09 -0700 + Commit: d4e31bf, github.com/apache/spark/pull/5966 + + [SPARK-5213] [SQL] Remove the duplicated SparkSQLParser + Cheng Hao + 2015-05-07 12:09:54 -0700 + Commit: 2b0c423, github.com/apache/spark/pull/5965 + + [SPARK-7116] [SQL] [PYSPARK] Remove cache() causing memory leak + ksonj + 2015-05-07 12:04:19 -0700 + Commit: 86f141c, github.com/apache/spark/pull/5973 + + [SPARK-1442] [SQL] [FOLLOW-UP] Address minor comments in Window Function PR (#5604). 
+ Yin Huai + 2015-05-07 11:46:49 -0700 + Commit: 9dcf4f7, github.com/apache/spark/pull/5945 + + [SPARK-6093] [MLLIB] Add RegressionMetrics in PySpark/MLlib + Yanbo Liang + 2015-05-07 11:18:32 -0700 + Commit: ef835dc, github.com/apache/spark/pull/5941 + + [SPARK-7118] [Python] Add the coalesce Spark SQL function available in PySpark + Olivier Girardot + 2015-05-07 10:58:35 -0700 + Commit: 3038b26, github.com/apache/spark/pull/5698 + + [SPARK-7388] [SPARK-7383] wrapper for VectorAssembler in Python + Burak Yavuz , Xiangrui Meng + 2015-05-07 10:25:41 -0700 + Commit: 6b9737a, github.com/apache/spark/pull/5930 + + [SPARK-7330] [SQL] avoid NPE at jdbc rdd + Daoyuan Wang + 2015-05-07 10:05:01 -0700 + Commit: 84ee348, github.com/apache/spark/pull/5877 + + [SPARK-7429] [ML] Params cleanups + Joseph K. Bradley + 2015-05-07 01:28:44 -0700 + Commit: 91ce131, github.com/apache/spark/pull/5960 + + [SPARK-7421] [MLLIB] OnlineLDA cleanups + Joseph K. Bradley + 2015-05-07 01:12:14 -0700 + Commit: a038c51, github.com/apache/spark/pull/5956 + + [SPARK-7035] Encourage __getitem__ over __getattr__ on column access in the Python DataFrame API + ksonj + 2015-05-07 01:02:00 -0700 + Commit: b929a75, github.com/apache/spark/pull/5971 + + [SPARK-7295][SQL] bitwise operations for DataFrame DSL + Shiti + 2015-05-07 01:00:29 -0700 + Commit: 703211b, github.com/apache/spark/pull/5867 + + [SPARK-7217] [STREAMING] Add configuration to control the default behavior of StreamingContext.stop() implicitly calling SparkContext.stop() + Tathagata Das + 2015-05-07 00:24:44 -0700 + Commit: cb13c98, github.com/apache/spark/pull/5929 + + [SPARK-7430] [STREAMING] [TEST] General improvements to streaming tests to increase debuggability + Tathagata Das + 2015-05-07 00:21:10 -0700 + Commit: 065d114, github.com/apache/spark/pull/5961 + + [SPARK-5938] [SPARK-5443] [SQL] Improve JsonRDD performance + Nathan Howell + 2015-05-06 22:56:53 -0700 + Commit: 2337ccc1, github.com/apache/spark/pull/5801 + + [SPARK-6812] [SPARKR] filter() on DataFrame does not work as expected. + Sun Rui + 2015-05-06 22:48:16 -0700 + Commit: 4948f42, github.com/apache/spark/pull/5938 + + [SPARK-7432] [MLLIB] disable cv doctest + Xiangrui Meng + 2015-05-06 22:29:07 -0700 + Commit: fb4967b, github.com/apache/spark/pull/5962 + + [SPARK-7405] [STREAMING] Fix the bug that ReceiverInputDStream doesn't report InputInfo + zsxwing + 2015-05-06 18:07:00 -0700 + Commit: d6e76cb, github.com/apache/spark/pull/5950 + + [HOT FIX] For DAG visualization #5954 + Andrew Or + 2015-05-06 18:02:08 -0700 + Commit: 85a644b + + [SPARK-7371] [SPARK-7377] [SPARK-7408] DAG visualization addendum (#5729) + Andrew Or + 2015-05-06 17:52:34 -0700 + Commit: 76e8344, github.com/apache/spark/pull/5954 + + [SPARK-7396] [STREAMING] [EXAMPLE] Update KafkaWordCountProducer to use new Producer API + jerryshao + 2015-05-06 17:44:43 -0700 + Commit: ba24dfa, github.com/apache/spark/pull/5936 + + [SPARK-6799] [SPARKR] Remove SparkR RDD examples, add dataframe examples + Shivaram Venkataraman + 2015-05-06 17:28:11 -0700 + Commit: 4b91e18, github.com/apache/spark/pull/5949 + + [HOT FIX] [SPARK-7418] Ignore flaky SparkSubmitUtilsSuite test + Andrew Or + 2015-05-06 17:08:39 -0700 + Commit: c0ec20a + + [SPARK-5995] [ML] Make Prediction dev API public + Joseph K. 
Bradley + 2015-05-06 16:15:51 -0700 + Commit: b681b93, github.com/apache/spark/pull/5913 + + [HOT-FIX] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. + Yin Huai + 2015-05-06 14:48:25 -0700 + Commit: 14bcb84, github.com/apache/spark/pull/5951 + + Add `Private` annotation. + Josh Rosen + 2015-05-06 11:03:17 -0700 + Commit: 2163367 + + [SPARK-7311] Introduce internal Serializer API for determining if serializers support object relocation + Josh Rosen + 2015-05-06 10:52:55 -0700 + Commit: d651e28, github.com/apache/spark/pull/5924 + + [SPARK-1442] [SQL] Window Function Support for Spark SQL + Yin Huai + 2015-05-06 10:43:00 -0700 + Commit: b521a3b, github.com/apache/spark/pull/5604 + + [SPARK-6201] [SQL] promote string and do widen types for IN + Daoyuan Wang + 2015-05-06 10:30:42 -0700 + Commit: 7212897, github.com/apache/spark/pull/4945 + + [SPARK-5456] [SQL] fix decimal compare for jdbc rdd + Daoyuan Wang + 2015-05-06 10:05:10 -0700 + Commit: f1a5caf, github.com/apache/spark/pull/5803 + + [SQL] JavaDoc update for various DataFrame functions. + Reynold Xin + 2015-05-06 08:50:56 -0700 + Commit: 389b755, github.com/apache/spark/pull/5935 + + [SPARK-6940] [MLLIB] Add CrossValidator to Python ML pipeline API + Xiangrui Meng + 2015-05-06 01:28:43 -0700 + Commit: 3e27a54, github.com/apache/spark/pull/5926 + + [SPARK-7384][Core][Tests] Fix flaky tests for distributed mode in BroadcastSuite + zsxwing + 2015-05-05 23:25:28 -0700 + Commit: 20f9237, github.com/apache/spark/pull/5925 + + [SPARK-6267] [MLLIB] Python API for IsotonicRegression + Yanbo Liang , Xiangrui Meng + 2015-05-05 22:57:13 -0700 + Commit: 384ac3c, github.com/apache/spark/pull/5890 + + [SPARK-7358][SQL] Move DataFrame mathfunctions into functions + Burak Yavuz + 2015-05-05 22:56:01 -0700 + Commit: 8aa6681, github.com/apache/spark/pull/5923 + + [SPARK-6841] [SPARKR] add support for mean, median, stdev etc. + qhuang + 2015-05-05 20:39:56 -0700 + Commit: b5cd7dc, github.com/apache/spark/pull/5446 + + Revert "[SPARK-3454] separate json endpoints for data in the UI" + Reynold Xin + 2015-05-05 19:28:35 -0700 + Commit: 765f6e1 + + [SPARK-6231][SQL/DF] Automatically resolve join condition ambiguity for self-joins. + Reynold Xin + 2015-05-05 18:59:46 -0700 + Commit: e61083c, github.com/apache/spark/pull/5919 + + Some minor cleanup after SPARK-4550. + Sandy Ryza + 2015-05-05 18:32:16 -0700 + Commit: 762ff2e, github.com/apache/spark/pull/5916 + + [SPARK-7230] [SPARKR] Make RDD private in SparkR. 
+ Shivaram Venkataraman + 2015-05-05 14:40:33 -0700 + Commit: 4afb578, github.com/apache/spark/pull/5895 + + [SQL][Minor] make StringComparison extends ExpectsInputTypes + wangfei + 2015-05-05 14:24:37 -0700 + Commit: b6566a2, github.com/apache/spark/pull/5905 + + [SPARK-7351] [STREAMING] [DOCS] Add spark.streaming.ui.retainedBatches to docs + zsxwing + 2015-05-05 13:42:23 -0700 + Commit: 4c95fe5, github.com/apache/spark/pull/5899 + + [SPARK-7294][SQL] ADD BETWEEN + 云峤 , kaka1992 + 2015-05-05 13:23:53 -0700 + Commit: c68d0e2, github.com/apache/spark/pull/5839 + + [SPARK-6939] [STREAMING] [WEBUI] Add timeline and histogram graphs for streaming statistics + zsxwing + 2015-05-05 12:52:16 -0700 + Commit: 8109c9e, github.com/apache/spark/pull/5533 + + [SPARK-5888] [MLLIB] Add OneHotEncoder as a Transformer + Sandy Ryza + 2015-05-05 12:34:02 -0700 + Commit: 94ac9eb, github.com/apache/spark/pull/5500 + + [SPARK-7333] [MLLIB] Add BinaryClassificationEvaluator to PySpark + Xiangrui Meng + 2015-05-05 11:45:37 -0700 + Commit: dfb6bfc, github.com/apache/spark/pull/5885 + + [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames + Burak Yavuz + 2015-05-05 11:01:25 -0700 + Commit: 598902b, github.com/apache/spark/pull/5900 + + [SPARK-7007] [CORE] Add a metric source for ExecutorAllocationManager + jerryshao + 2015-05-05 09:43:49 -0700 + Commit: 29350ee, github.com/apache/spark/pull/5589 + + [SPARK-7318] [STREAMING] DStream cleans objects that are not closures + Andrew Or + 2015-05-05 09:37:49 -0700 + Commit: acc877a, github.com/apache/spark/pull/5860 + + [SPARK-7237] Many user provided closures are not actually cleaned + Andrew Or + 2015-05-05 09:37:04 -0700 + Commit: 01d4022, github.com/apache/spark/pull/5787 + + [SPARK-6612] [MLLIB] [PYSPARK] Python KMeans parity + Hrishikesh Subramonian + 2015-05-05 07:57:39 -0700 + Commit: 8b63103, github.com/apache/spark/pull/5647 + + [SPARK-7202] [MLLIB] [PYSPARK] Add SparseMatrixPickler to SerDe + MechCoder + 2015-05-05 07:53:11 -0700 + Commit: cd55e9a, github.com/apache/spark/pull/5775 + + [SPARK-7350] [STREAMING] [WEBUI] Attach the Streaming tab when calling ssc.start() + zsxwing + 2015-05-05 15:09:58 +0100 + Commit: 49923f7, github.com/apache/spark/pull/5898 + + [SPARK-5074] [CORE] [TESTS] Fix the flakey test 'run shuffle with map stage failure' in DAGSchedulerSuite + zsxwing + 2015-05-05 15:04:14 +0100 + Commit: 6f35dac, github.com/apache/spark/pull/5903 + + [MINOR] Minor update for document + Liang-Chi Hsieh + 2015-05-05 14:44:02 +0100 + Commit: d288322, github.com/apache/spark/pull/5906 + + [SPARK-3454] separate json endpoints for data in the UI + Imran Rashid + 2015-05-05 07:25:40 -0500 + Commit: ff8b449, github.com/apache/spark/pull/4435 + + [SPARK-5112] Expose SizeEstimator as a developer api + Sandy Ryza + 2015-05-05 12:38:46 +0100 + Commit: 0327ca2, github.com/apache/spark/pull/3913 + + [SPARK-6653] [YARN] New config to specify port for sparkYarnAM actor system + shekhar.bansal + 2015-05-05 11:09:51 +0100 + Commit: 93af96a, github.com/apache/spark/pull/5719 + + [SPARK-7341] [STREAMING] [TESTS] Fix the flaky test: org.apache.spark.stre... 
+ zsxwing + 2015-05-05 02:15:39 -0700 + Commit: 0634510, github.com/apache/spark/pull/5891 + + [SPARK-7113] [STREAMING] Support input information reporting for Direct Kafka stream + jerryshao + 2015-05-05 02:01:06 -0700 + Commit: becdb81, github.com/apache/spark/pull/5879 + + [HOTFIX] [TEST] Ignoring flaky tests + Tathagata Das + 2015-05-05 01:58:51 -0700 + Commit: e8f847a, github.com/apache/spark/pull/5901 + + [SPARK-7139] [STREAMING] Allow received block metadata to be saved to WAL and recovered on driver failure + Tathagata Das + 2015-05-05 01:45:19 -0700 + Commit: ae27c0e, github.com/apache/spark/pull/5732 + + [MINOR] [BUILD] Declare ivy dependency in root pom. + Marcelo Vanzin + 2015-05-05 08:56:16 +0100 + Commit: 5160437, github.com/apache/spark/pull/5893 + + [SPARK-7314] [SPARK-3524] [PYSPARK] upgrade Pyrolite to 4.4 + Xiangrui Meng + 2015-05-04 23:52:42 -0700 + Commit: 21ed108, github.com/apache/spark/pull/5850 + + [SPARK-7236] [CORE] Fix to prevent AkkaUtils askWithReply from sleeping on final attempt + Bryan Cutler + 2015-05-04 18:29:22 -0700 + Commit: 48655d1, github.com/apache/spark/pull/5896 + + [SPARK-7266] Add ExpectsInputTypes to expressions when possible. + Reynold Xin + 2015-05-04 18:03:07 -0700 + Commit: 1388a46, github.com/apache/spark/pull/5796 + + [SPARK-7243][SQL] Contingency Tables for DataFrames + Burak Yavuz + 2015-05-04 17:02:49 -0700 + Commit: ecf0d8a, github.com/apache/spark/pull/5842 + + [SPARK-6943] [SPARK-6944] DAG visualization on SparkUI + Andrew Or + 2015-05-04 16:21:36 -0700 + Commit: 863ec0c, github.com/apache/spark/pull/5729 + + [SPARK-7319][SQL] Improve the output from DataFrame.show() + 云峤 + 2015-05-04 12:08:38 -0700 + Commit: 34edaa8, github.com/apache/spark/pull/5865 + + [SPARK-5956] [MLLIB] Pipeline components should be copyable. + Xiangrui Meng + 2015-05-04 11:28:59 -0700 + Commit: 893b310, github.com/apache/spark/pull/5820 + + [SPARK-5100] [SQL] add webui for thriftserver + tianyi + 2015-05-04 16:59:34 +0800 + Commit: 343d3bf, github.com/apache/spark/pull/5730 + + [SPARK-5563] [MLLIB] LDA with online variational inference + Yuhao Yang , Joseph K. 
Bradley + 2015-05-04 00:06:25 -0700 + Commit: 3539cb7, github.com/apache/spark/pull/4419 + + [SPARK-7241] Pearson correlation for DataFrames + Burak Yavuz + 2015-05-03 21:44:39 -0700 + Commit: 9646018, github.com/apache/spark/pull/5858 + + [SPARK-7329] [MLLIB] simplify ParamGridBuilder impl + Xiangrui Meng + 2015-05-03 18:06:48 -0700 + Commit: 1ffa8cb, github.com/apache/spark/pull/5873 + + [SPARK-7302] [DOCS] SPARK building documentation still mentions building for yarn 0.23 + Sean Owen + 2015-05-03 21:22:31 +0100 + Commit: 9e25b09, github.com/apache/spark/pull/5863 + + [SPARK-6907] [SQL] Isolated client for HiveMetastore + Michael Armbrust + 2015-05-03 13:12:50 -0700 + Commit: daa70bf, github.com/apache/spark/pull/5851 + + [SPARK-7022] [PYSPARK] [ML] Add ML.Tuning.ParamGridBuilder to PySpark + Omede Firouz , Omede + 2015-05-03 11:42:02 -0700 + Commit: f4af925, github.com/apache/spark/pull/5601 + + [SPARK-7031] [THRIFTSERVER] let thrift server take SPARK_DAEMON_MEMORY and SPARK_DAEMON_JAVA_OPTS + WangTaoTheTonic + 2015-05-03 00:47:47 +0100 + Commit: 49549d5, github.com/apache/spark/pull/5609 + + [SPARK-7255] [STREAMING] [DOCUMENTATION] Added documentation for spark.streaming.kafka.maxRetries + BenFradet + 2015-05-02 23:41:14 +0100 + Commit: ea841ef, github.com/apache/spark/pull/5808 + + [SPARK-5213] [SQL] Pluggable SQL Parser Support + Cheng Hao , scwf + 2015-05-02 15:20:07 -0700 + Commit: 5d6b90d, github.com/apache/spark/pull/5827 + + [MINOR] [HIVE] Fix QueryPartitionSuite. + Marcelo Vanzin + 2015-05-02 23:10:35 +0100 + Commit: 82c8c37, github.com/apache/spark/pull/5854 + + [SPARK-6030] [CORE] Using simulated field layout method to compute class shellSize + Ye Xianjin + 2015-05-02 23:08:09 +0100 + Commit: bfcd528, github.com/apache/spark/pull/4783 + + [SPARK-7323] [SPARK CORE] Use insertAll instead of insert while merging combiners in reducer + Mridul Muralidharan + 2015-05-02 23:05:51 +0100 + Commit: da30352, github.com/apache/spark/pull/5862 + + [SPARK-3444] Fix typo in Dataframes.py introduced in [] + Dean Chen + 2015-05-02 23:04:13 +0100 + Commit: 856a571, github.com/apache/spark/pull/5866 + + [SPARK-7315] [STREAMING] [TEST] Fix flaky WALBackedBlockRDDSuite + Tathagata Das + 2015-05-02 01:53:14 -0700 + Commit: ecc6eb5, github.com/apache/spark/pull/5853 + + [SPARK-7120] [SPARK-7121] Closure cleaner nesting + documentation + tests + Andrew Or + 2015-05-01 23:57:58 -0700 + Commit: 7394e7a, github.com/apache/spark/pull/5685 + + [SPARK-7242] added python api for freqItems in DataFrames + Burak Yavuz + 2015-05-01 23:43:24 -0700 + Commit: 2e0f357, github.com/apache/spark/pull/5859 + + [SPARK-7317] [Shuffle] Expose shuffle handle + Mridul Muralidharan + 2015-05-01 21:23:42 -0700 + Commit: b79aeb9, github.com/apache/spark/pull/5857 + + [SPARK-6229] Add SASL encryption to network library. 
+ Marcelo Vanzin + 2015-05-01 19:01:46 -0700 + Commit: 38d4e9e, github.com/apache/spark/pull/5377 + + [SPARK-2691] [MESOS] Support for Mesos DockerInfo + Chris Heller + 2015-05-01 18:41:22 -0700 + Commit: 8f50a07, github.com/apache/spark/pull/3074 + + [SPARK-6443] [SPARK SUBMIT] Could not submit app in standalone cluster mode when HA is enabled + WangTaoTheTonic + 2015-05-01 18:38:20 -0700 + Commit: b4b43df, github.com/apache/spark/pull/5116 + + [SPARK-7216] [MESOS] Add driver details page to Mesos cluster UI. + Timothy Chen + 2015-05-01 18:36:42 -0700 + Commit: 2022193, github.com/apache/spark/pull/5763 + + [SPARK-6954] [YARN] ExecutorAllocationManager can end up requesting a negative n... + Sandy Ryza + 2015-05-01 18:32:46 -0700 + Commit: 099327d, github.com/apache/spark/pull/5704 + + [SPARK-3444] Provide an easy way to change log level + Holden Karau + 2015-05-01 18:02:10 -0700 + Commit: ae98eec, github.com/apache/spark/pull/5791 + + [SPARK-2808][Streaming][Kafka] update kafka to 0.8.2 + cody koeninger , Helena Edelson + 2015-05-01 17:54:56 -0700 + Commit: 4786484, github.com/apache/spark/pull/4537 + + [SPARK-7112][Streaming][WIP] Add a InputInfoTracker to track all the input streams + jerryshao , Saisai Shao + 2015-05-01 17:46:06 -0700 + Commit: b88c275, github.com/apache/spark/pull/5680 + + [SPARK-7309] [CORE] [STREAMING] Shutdown the thread pools in ReceivedBlockHandler and DAGScheduler + zsxwing + 2015-05-01 17:41:55 -0700 + Commit: ebc25a4, github.com/apache/spark/pull/5845 + + [SPARK-6999] [SQL] Remove the infinite recursive method (useless) + Cheng Hao + 2015-05-01 19:39:30 -0500 + Commit: 98e7045, github.com/apache/spark/pull/5804 + + [SPARK-7304] [BUILD] Include $@ in call to mvn consistently in make-distribution.sh + Rajendra Gokhale (rvgcentos) + 2015-05-01 17:01:36 -0700 + Commit: e6fb377, github.com/apache/spark/pull/5846 + + [SPARK-7312][SQL] SPARK-6913 broke jdk6 build + Yin Huai + 2015-05-01 16:47:00 -0700 + Commit: 41c6a44, github.com/apache/spark/pull/5847 + + Ignore flakey test in SparkSubmitUtilsSuite + Patrick Wendell + 2015-05-01 14:42:58 -0700 + Commit: 5c1faba + + [SPARK-5342] [YARN] Allow long running Spark apps to run on secure YARN/HDFS + Hari Shreedharan + 2015-05-01 15:32:09 -0500 + Commit: b1f4ca8, github.com/apache/spark/pull/5823 + + [SPARK-7240][SQL] Single pass covariance calculation for dataframes + Burak Yavuz + 2015-05-01 13:29:17 -0700 + Commit: 4dc8d74, github.com/apache/spark/pull/5825 + + [SPARK-7281] [YARN] Add option to set AM's lib path in client mode. + Marcelo Vanzin + 2015-05-01 21:20:46 +0100 + Commit: 7b5dd3e, github.com/apache/spark/pull/5813 + + [SPARK-7213] [YARN] Check for read permissions before copying a Hadoop config file + Nishkam Ravi , nishkamravi2 , nravi + 2015-05-01 21:14:16 +0100 + Commit: f53a488, github.com/apache/spark/pull/5760 + + Revert "[SPARK-7224] added mock repository generator for --packages tests" + Patrick Wendell + 2015-05-01 13:01:43 -0700 + Commit: c6d9a42 + + Revert "[SPARK-7287] enabled fixed test" + Patrick Wendell + 2015-05-01 13:01:14 -0700 + Commit: 58d6584 + + [SPARK-7274] [SQL] Create Column expression for array/struct creation. 
+ Reynold Xin + 2015-05-01 12:49:02 -0700 + Commit: 3753776, github.com/apache/spark/pull/5802 + + [SPARK-7183] [NETWORK] Fix memory leak of TransportRequestHandler.streamIds + Liang-Chi Hsieh + 2015-05-01 11:59:12 -0700 + Commit: 1686032, github.com/apache/spark/pull/5743 + + [SPARK-6846] [WEBUI] [HOTFIX] return to GET for kill link in UI since YARN AM won't proxy POST + Sean Owen + 2015-05-01 19:57:37 +0100 + Commit: 1262e31, github.com/apache/spark/pull/5837 + + [SPARK-5854] personalized page rank + Dan McClary , dwmclary + 2015-05-01 11:55:43 -0700 + Commit: 7d42722, github.com/apache/spark/pull/4774 + + changing persistence engine trait to an abstract class + niranda + 2015-05-01 11:27:45 -0700 + Commit: 27de6fe, github.com/apache/spark/pull/5832 + + Limit help option regex + Chris Biow + 2015-05-01 19:26:55 +0100 + Commit: c8c481d, github.com/apache/spark/pull/5816 + + [SPARK-5891] [ML] Add Binarizer ML Transformer + Liang-Chi Hsieh + 2015-05-01 08:31:01 -0700 + Commit: 7630213, github.com/apache/spark/pull/5699 + + [SPARK-3066] [MLLIB] Support recommendAll in matrix factorization model + Debasish Das , Xiangrui Meng + 2015-05-01 08:27:46 -0700 + Commit: 3b514af, github.com/apache/spark/pull/3098 + + [SPARK-4705] Handle multiple app attempts event logs, history server. + Marcelo Vanzin , twinkle sachdeva , twinkle.sachdeva , twinkle sachdeva + 2015-05-01 09:50:55 -0500 + Commit: 3052f49, github.com/apache/spark/pull/5432 + + [SPARK-3468] [WEBUI] Timeline-View feature + Kousuke Saruta + 2015-05-01 01:39:56 -0700 + Commit: 7fe0f3f, github.com/apache/spark/pull/2342 + + [SPARK-6257] [PYSPARK] [MLLIB] MLlib API missing items in Recommendation + MechCoder + 2015-04-30 23:51:00 -0700 + Commit: c24aeb6, github.com/apache/spark/pull/5807 + + [SPARK-7291] [CORE] Fix a flaky test in AkkaRpcEnvSuite + zsxwing + 2015-04-30 23:44:33 -0700 + Commit: 14b3288, github.com/apache/spark/pull/5822 + + [SPARK-7287] enabled fixed test + Burak Yavuz + 2015-04-30 23:39:58 -0700 + Commit: 7cf1eb7, github.com/apache/spark/pull/5826 + + [SPARK-4550] In sort-based shuffle, store map outputs in serialized form + Sandy Ryza + 2015-04-30 23:14:14 -0700 + Commit: 0a2b15c, github.com/apache/spark/pull/4450 + + HOTFIX: Disable buggy dependency checker + Patrick Wendell + 2015-04-30 22:39:58 -0700 + Commit: a9fc505 + + [SPARK-6479] [BLOCK MANAGER] Create off-heap block storage API + Zhan Zhang + 2015-04-30 22:24:31 -0700 + Commit: 36a7a68, github.com/apache/spark/pull/5430 + + [SPARK-7248] implemented random number generators for DataFrames + Burak Yavuz + 2015-04-30 21:56:03 -0700 + Commit: b5347a4, github.com/apache/spark/pull/5819 + + [SPARK-7282] [STREAMING] Fix the race conditions in StreamingListenerSuite + zsxwing + 2015-04-30 21:32:11 -0700 + Commit: 69a739c, github.com/apache/spark/pull/5812 + + Revert "[SPARK-5213] [SQL] Pluggable SQL Parser Support" + Patrick Wendell + 2015-04-30 20:33:36 -0700 + Commit: beeafcf + + [SPARK-7123] [SQL] support table.star in sqlcontext + scwf + 2015-04-30 18:50:14 -0700 + Commit: 473552f, github.com/apache/spark/pull/5690 + + [SPARK-5213] [SQL] Pluggable SQL Parser Support + Cheng Hao + 2015-04-30 18:49:06 -0700 + Commit: 3ba5aaa, github.com/apache/spark/pull/4015 + + [SPARK-6913][SQL] Fixed "java.sql.SQLException: 
No suitable driver found" + Vyacheslav Baranov + 2015-04-30 18:45:14 -0700 + Commit: e991255, github.com/apache/spark/pull/5782 + + [SPARK-7109] [SQL] Push down left side filter for left semi join + wangfei , scwf + 2015-04-30 18:18:54 -0700 + Commit: a0d8a61, github.com/apache/spark/pull/5677 + + [SPARK-7093] [SQL] Using newPredicate in NestedLoopJoin to enable code generation + scwf + 2015-04-30 18:15:56 -0700 + Commit: 0797338, github.com/apache/spark/pull/5665 + + [SPARK-7280][SQL] Add "drop" column/s on a data frame + rakeshchalasani + 2015-04-30 17:42:50 -0700 + Commit: ee04413, github.com/apache/spark/pull/5818 + + [SPARK-7242][SQL][MLLIB] Frequent items for DataFrames + Burak Yavuz + 2015-04-30 16:40:32 -0700 + Commit: 149b3ee, github.com/apache/spark/pull/5799 + + [SPARK-7279] Removed diffSum which is theoretical zero in LinearRegression and coding formating + DB Tsai + 2015-04-30 16:26:51 -0700 + Commit: 1c3e402, github.com/apache/spark/pull/5809 + + [Build] Enable MiMa checks for SQL + Josh Rosen + 2015-04-30 16:23:01 -0700 + Commit: fa01bec, github.com/apache/spark/pull/5727 + + [SPARK-7267][SQL]Push down Project when it's child is Limit + Zhongshuai Pei <799203320@qq.com>, DoingDone9 <799203320@qq.com> + 2015-04-30 15:22:13 -0700 + Commit: 77cc25f, github.com/apache/spark/pull/5797 + + [SPARK-7288] Suppress compiler warnings due to use of sun.misc.Unsafe; add facade in front of Unsafe; remove use of Unsafe.setMemory + Josh Rosen + 2015-04-30 15:21:00 -0700 + Commit: 07a8620, github.com/apache/spark/pull/5814 + + [SPARK-7196][SQL] Support precision and scale of decimal type for JDBC + Liang-Chi Hsieh + 2015-04-30 15:13:43 -0700 + Commit: 6702324, github.com/apache/spark/pull/5777 + + Revert "[SPARK-5342] [YARN] Allow long running Spark apps to run on secure YARN/HDFS" + Patrick Wendell + 2015-04-30 14:59:20 -0700 + Commit: e0628f2 + + [SPARK-7207] [ML] [BUILD] Added ml.recommendation, ml.regression to SparkBuild + Joseph K. Bradley + 2015-04-30 14:39:27 -0700 + Commit: adbdb19, github.com/apache/spark/pull/5758 + + [SPARK-5342] [YARN] Allow long running Spark apps to run on secure YARN/HDFS + Hari Shreedharan + 2015-04-30 13:03:23 -0500 + Commit: 6c65da6, github.com/apache/spark/pull/4688 + + [SPARK-7224] added mock repository generator for --packages tests + Burak Yavuz + 2015-04-30 10:19:08 -0700 + Commit: 7dacc08, github.com/apache/spark/pull/5790 + + [HOTFIX] Disabling flaky test (fix in progress as part of SPARK-7224) + Patrick Wendell + 2015-04-30 01:02:33 -0700 + Commit: 47bf406 + + [SPARK-1406] Mllib pmml model export + Vincenzo Selvaggio , Xiangrui Meng , selvinsource + 2015-04-29 23:21:21 -0700 + Commit: 254e050, github.com/apache/spark/pull/3062 + + [SPARK-7225][SQL] CombineLimits optimizer does not work + Zhongshuai Pei <799203320@qq.com>, DoingDone9 <799203320@qq.com> + 2015-04-29 22:44:14 -0700 + Commit: 4459514, github.com/apache/spark/pull/5770 + + Some code clean up. + DB Tsai + 2015-04-29 21:44:41 -0700 + Commit: ba49eb1, github.com/apache/spark/pull/5794 + + [SPARK-7156][SQL] Addressed follow up comments for randomSplit + Burak Yavuz + 2015-04-29 19:13:47 -0700 + Commit: 5553198, github.com/apache/spark/pull/5795 + + [SPARK-7234][SQL] Fix DateType mismatch when codegen on. 
+ 云峤 + 2015-04-29 18:23:42 -0700 + Commit: 7143f6e, github.com/apache/spark/pull/5778 + + [SPARK-6862] [STREAMING] [WEBUI] Add BatchPage to display details of a batch + zsxwing + 2015-04-29 18:22:14 -0700 + Commit: 1b7106b, github.com/apache/spark/pull/5473 + + [SPARK-7176] [ML] Add validation functionality to Param + Joseph K. Bradley + 2015-04-29 17:26:46 -0700 + Commit: 114bad6, github.com/apache/spark/pull/5740 + + [SQL] [Minor] Print detail query execution info when spark answer is not right + wangfei + 2015-04-29 17:00:24 -0700 + Commit: 1fdfdb4, github.com/apache/spark/pull/5774 + + [SPARK-7259] [ML] VectorIndexer: do not copy non-ML metadata to output column + Joseph K. Bradley + 2015-04-29 16:35:17 -0700 + Commit: b1ef6a6, github.com/apache/spark/pull/5789 + + [SPARK-7229] [SQL] SpecificMutableRow should take integer type as internal representation for Date + Cheng Hao + 2015-04-29 16:23:34 -0700 + Commit: f8cbb0a, github.com/apache/spark/pull/5772 + + [SPARK-7155] [CORE] Allow newAPIHadoopFile to support comma-separated list of files as input + yongtang + 2015-04-29 23:55:51 +0100 + Commit: 3fc6cfd, github.com/apache/spark/pull/5708 + + [SPARK-7181] [CORE] fix inifite loop in Externalsorter's mergeWithAggregation + Qiping Li + 2015-04-29 23:52:16 +0100 + Commit: 7f4b583, github.com/apache/spark/pull/5737 + + [SPARK-7156][SQL] support RandomSplit in DataFrames + Burak Yavuz + 2015-04-29 15:34:05 -0700 + Commit: d7dbce8, github.com/apache/spark/pull/5761 + + [SPARK-6529] [ML] Add Word2Vec transformer + Xusen Yin + 2015-04-29 14:55:32 -0700 + Commit: c9d530e, github.com/apache/spark/pull/5596 + + [SPARK-7222] [ML] Added mathematical derivation in comment and compressed the model, removed the correction terms in LinearRegression with ElasticNet + DB Tsai + 2015-04-29 14:53:37 -0700 + Commit: 15995c8, github.com/apache/spark/pull/5767 + + [SPARK-6629] cancelJobGroup() may not work for jobs whose job groups are inherited from parent threads + Josh Rosen + 2015-04-29 13:31:52 -0700 + Commit: 3a180c1, github.com/apache/spark/pull/5288 + + [SPARK-6752] [STREAMING] [REOPENED] Allow StreamingContext to be recreated from checkpoint and existing SparkContext + Tathagata Das + 2015-04-29 13:10:31 -0700 + Commit: a9c4e29, github.com/apache/spark/pull/5773 + + [SPARK-7056] [STREAMING] Make the Write Ahead Log pluggable + Tathagata Das + 2015-04-29 13:06:11 -0700 + Commit: 1868bd4, github.com/apache/spark/pull/5645 + + Fix a typo of "threshold" + Xusen Yin + 2015-04-29 10:13:48 -0700 + Commit: c0c0ba6, github.com/apache/spark/pull/5769 + + [SQL][Minor] fix java doc for DataFrame.agg + Wenchen Fan + 2015-04-29 09:49:24 -0700 + Commit: 81ea42b, github.com/apache/spark/pull/5712 + + Better error message on access to non-existing attribute + ksonj + 2015-04-29 09:48:47 -0700 + Commit: 3df9c5d, github.com/apache/spark/pull/5771 + + [SPARK-7223] Rename RPC askWithReply -> askWithReply, sendWithReply -> ask. + Reynold Xin + 2015-04-29 09:46:37 -0700 + Commit: 687273d, github.com/apache/spark/pull/5768 + + [SPARK-6918] [YARN] Secure HBase support. 
+ Dean Chen + 2015-04-29 08:58:33 -0500 + Commit: baed3f2, github.com/apache/spark/pull/5586 + + [SPARK-7076][SPARK-7077][SPARK-7080][SQL] Use managed memory for aggregations + Josh Rosen + 2015-04-29 01:07:26 -0700 + Commit: f49284b, github.com/apache/spark/pull/5725 + + [SPARK-7204] [SQL] Fix callSite for Dataframe and SQL operations + Patrick Wendell + 2015-04-29 00:35:08 -0700 + Commit: 1fd6ed9, github.com/apache/spark/pull/5757 + + [SPARK-7188] added python support for math DataFrame functions + Burak Yavuz + 2015-04-29 00:09:24 -0700 + Commit: fe917f5, github.com/apache/spark/pull/5750 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-04-28 23:38:59 -0700 + Commit: 8dee274, github.com/apache/spark/pull/3205 + + [SPARK-7205] Support `.ivy2/local` and `.m2/repositories/` in --packages + Burak Yavuz + 2015-04-28 23:05:02 -0700 + Commit: f98773a, github.com/apache/spark/pull/5755 + + [SPARK-7215] made coalesce and repartition a part of the query plan + Burak Yavuz + 2015-04-28 22:48:04 -0700 + Commit: 271c4c6, github.com/apache/spark/pull/5762 + + [SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector + Xiangrui Meng + 2015-04-28 21:49:53 -0700 + Commit: 5ef006f, github.com/apache/spark/pull/5756 + + [SPARK-7208] [ML] [PYTHON] Added Matrix, SparseMatrix to __all__ list in linalg.py + Joseph K. Bradley + 2015-04-28 21:15:47 -0700 + Commit: a8aeadb, github.com/apache/spark/pull/5759 + + [SPARK-7138] [STREAMING] Add method to BlockGenerator to add multiple records to BlockGenerator with single callback + Tathagata Das + 2015-04-28 19:31:57 -0700 + Commit: 5c8f4bd, github.com/apache/spark/pull/5695 + + [SPARK-6965] [MLLIB] StringIndexer handles numeric input. + Xiangrui Meng + 2015-04-28 17:41:09 -0700 + Commit: d36e673, github.com/apache/spark/pull/5753 + + Closes #4807 Closes #5055 Closes #3583 + Xiangrui Meng + 2015-04-28 14:21:25 -0700 + Commit: 555213e + + [SPARK-7201] [MLLIB] move Identifiable to ml.util + Xiangrui Meng + 2015-04-28 14:07:26 -0700 + Commit: f0a1f90, github.com/apache/spark/pull/5749 + + [MINOR] [CORE] Warn users who try to cache RDDs with dynamic allocation on. + Marcelo Vanzin + 2015-04-28 13:49:29 -0700 + Commit: 28b1af7, github.com/apache/spark/pull/5751 + + [SPARK-5338] [MESOS] Add cluster mode support for Mesos + Timothy Chen , Luc Bourlier + 2015-04-28 13:31:08 -0700 + Commit: 53befac, github.com/apache/spark/pull/5144 + + [SPARK-6314] [CORE] handle JsonParseException for history server + Zhang, Liye + 2015-04-28 12:33:48 -0700 + Commit: 8009810, github.com/apache/spark/pull/5736 + + [SPARK-5932] [CORE] Use consistent naming for size properties + Ilya Ganelin + 2015-04-28 12:18:55 -0700 + Commit: 2d222fb, github.com/apache/spark/pull/5574 + + [SPARK-4286] Add an external shuffle service that can be run as a daemon. 
+ Iulian Dragos + 2015-04-28 12:08:18 -0700 + Commit: 8aab94d, github.com/apache/spark/pull/4990 + + [Core][test][minor] replace try finally block with tryWithSafeFinally + Zhang, Liye + 2015-04-28 10:24:00 -0700 + Commit: 52ccf1d, github.com/apache/spark/pull/5739 + + [SPARK-7140] [MLLIB] only scan the first 16 entries in Vector.hashCode + Xiangrui Meng + 2015-04-28 09:59:36 -0700 + Commit: b14cd23, github.com/apache/spark/pull/5697 + + [SPARK-5253] [ML] LinearRegression with L1/L2 (ElasticNet) using OWLQN + DB Tsai , DB Tsai + 2015-04-28 09:46:08 -0700 + Commit: 6a827d5, github.com/apache/spark/pull/4259 + + [SPARK-6435] spark-shell --jars option does not add all jars to classpath + Masayoshi TSUZUKI + 2015-04-28 07:55:21 -0400 + Commit: 268c419, github.com/apache/spark/pull/5227 + + [SPARK-7100] [MLLIB] Fix persisted RDD leak in GradientBoostTrees + Jim Carroll + 2015-04-28 07:51:02 -0400 + Commit: 75905c5, github.com/apache/spark/pull/5669 + + [SPARK-7168] [BUILD] Update plugin versions in Maven build and centralize versions + Sean Owen + 2015-04-28 07:48:34 -0400 + Commit: 7f3b3b7, github.com/apache/spark/pull/5720 + + [SPARK-6352] [SQL] Custom parquet output committer + Pei-Lun Lee + 2015-04-28 16:50:18 +0800 + Commit: e13cd86, github.com/apache/spark/pull/5525 + + [SPARK-7135][SQL] DataFrame expression for monotonically increasing IDs. + Reynold Xin + 2015-04-28 00:39:08 -0700 + Commit: d94cd1a, github.com/apache/spark/pull/5709 + + [SPARK-7187] SerializationDebugger should not crash user code + Andrew Or + 2015-04-28 00:38:14 -0700 + Commit: bf35edd, github.com/apache/spark/pull/5734 + + [SPARK-5946] [STREAMING] Add Python API for direct Kafka stream + jerryshao , Saisai Shao + 2015-04-27 23:48:02 -0700 + Commit: 9e4e82b, github.com/apache/spark/pull/4723 + + [SPARK-6829] Added math functions for DataFrames + Burak Yavuz + 2015-04-27 23:10:14 -0700 + Commit: 29576e7, github.com/apache/spark/pull/5616 + + [SPARK-7174][Core] Move calling `TaskScheduler.executorHeartbeatReceived` to another thread + zsxwing + 2015-04-27 21:45:40 -0700 + Commit: 874a2ca, github.com/apache/spark/pull/5723 + + [SPARK-7090] [MLLIB] Introduce LDAOptimizer to LDA to further improve extensibility + Yuhao Yang + 2015-04-27 19:02:51 -0700 + Commit: 4d9e560, github.com/apache/spark/pull/5661 + + [SPARK-7162] [YARN] Launcher error in yarn-client + GuoQiang Li + 2015-04-27 19:52:41 -0400 + Commit: 62888a4, github.com/apache/spark/pull/5716 + + [SPARK-7145] [CORE] commons-lang (2.x) classes used instead of commons-lang3 (3.x); commons-io used without dependency + Sean Owen + 2015-04-27 19:50:55 -0400 + Commit: ab5adb7, github.com/apache/spark/pull/5703 + + [SPARK-3090] [CORE] Stop SparkContext if user forgets to. + Marcelo Vanzin + 2015-04-27 19:46:17 -0400 + Commit: 5d45e1f, github.com/apache/spark/pull/5696 + + [SPARK-6738] [CORE] Improve estimate the size of a large array + Hong Shen + 2015-04-27 18:57:31 -0400 + Commit: 8e1c00d, github.com/apache/spark/pull/5608 + + [SPARK-7103] Fix crash with SparkContext.union when RDD has no partitioner + Steven She + 2015-04-27 18:55:02 -0400 + Commit: b9de9e0, github.com/apache/spark/pull/5679 + + [SPARK-6991] [SPARKR] Adds support for zipPartitions. 
+ hlin09 + 2015-04-27 15:04:37 -0700 + Commit: ca9f4eb, github.com/apache/spark/pull/5568 + + SPARK-7107 Add parameter for zookeeper.znode.parent to hbase_inputformat... + tedyu + 2015-04-27 14:42:40 -0700 + Commit: ef82bdd, github.com/apache/spark/pull/5673 + + [SPARK-6856] [R] Make RDD information more useful in SparkR + Jeff Harrison + 2015-04-27 13:38:25 -0700 + Commit: 7078f60, github.com/apache/spark/pull/5667 + + [SPARK-4925] Publish Spark SQL hive-thriftserver maven artifact + Misha Chernetsov + 2015-04-27 11:27:56 -0700 + Commit: 998aac2, github.com/apache/spark/pull/5429 + + [SPARK-6505] [SQL] Remove the reflection call in HiveFunctionWrapper + baishuo + 2015-04-27 14:08:05 +0800 + Commit: 82bb7fd, github.com/apache/spark/pull/5660 + + [SQL][Minor] rename DataTypeParser.apply to DataTypeParser.parse + wangfei + 2015-04-26 21:08:47 -0700 + Commit: d188b8b, github.com/apache/spark/pull/5710 + + [SPARK-7152][SQL] Add a Column expression for partition ID. + Reynold Xin + 2015-04-26 11:46:58 -0700 + Commit: ca55dc9, github.com/apache/spark/pull/5705 + + [MINOR] [MLLIB] Refactor toString method in MLLIB + Alain + 2015-04-26 07:14:24 -0400 + Commit: 9a5bbe0, github.com/apache/spark/pull/5687 + + [SPARK-6014] [CORE] [HOTFIX] Add try-catch block around ShutDownHook + Nishkam Ravi , nishkamravi2 , nravi + 2015-04-25 20:02:23 -0400 + Commit: f5473c2, github.com/apache/spark/pull/5672 + + [SPARK-7092] Update spark scala version to 2.11.6 + Prashant Sharma + 2015-04-25 18:07:34 -0400 + Commit: a11c868, github.com/apache/spark/pull/5662 + + [SQL] Update SQL readme to include instructions on generating golden answer files based on Hive 0.13.1. + Yin Huai + 2015-04-25 13:43:39 -0700 + Commit: aa6966f, github.com/apache/spark/pull/5702 + + [SPARK-6113] [ML] Tree ensembles for Pipelines API + Joseph K. Bradley + 2015-04-25 12:27:19 -0700 + Commit: a7160c4, github.com/apache/spark/pull/5626 + + Revert "[SPARK-6752][Streaming] Allow StreamingContext to be recreated from checkpoint and existing SparkContext" + Patrick Wendell + 2015-04-25 10:37:34 -0700 + Commit: a61d65f + + update the deprecated CountMinSketchMonoid function to TopPctCMS function + KeheCAI + 2015-04-25 08:42:38 -0400 + Commit: cca9905, github.com/apache/spark/pull/5629 + + [SPARK-7136][Docs] Spark SQL and DataFrame Guide fix example file and paths + Deborah Siegel , DEBORAH SIEGEL , DEBORAH SIEGEL , DEBORAH SIEGEL + 2015-04-24 20:25:07 -0700 + Commit: 59b7cfc, github.com/apache/spark/pull/5693 + + [PySpark][Minor] Update sql example, so that can read file correctly + linweizhong + 2015-04-24 20:23:19 -0700 + Commit: d874f8b, github.com/apache/spark/pull/5684 + + [SPARK-6122] [CORE] Upgrade tachyon-client version to 0.6.3 + Calvin Jia + 2015-04-24 17:57:41 -0400 + Commit: 438859e, github.com/apache/spark/pull/5354 + + [SPARK-6852] [SPARKR] Accept numeric as numPartitions in SparkR. + Sun Rui + 2015-04-24 12:52:07 -0700 + Commit: caf0136, github.com/apache/spark/pull/5613 + + [SPARK-7033] [SPARKR] Clean usage of split. Use partition instead where applicable. 
+ Sun Rui + 2015-04-24 11:00:19 -0700 + Commit: ebb77b2, github.com/apache/spark/pull/5628 + + [SPARK-6528] [ML] Add IDF transformer + Xusen Yin + 2015-04-24 08:29:49 -0700 + Commit: 6e57d57, github.com/apache/spark/pull/5266 + + [SPARK-7115] [MLLIB] skip the very first 1 in poly expansion + Xiangrui Meng + 2015-04-24 08:27:48 -0700 + Commit: 78b39c7, github.com/apache/spark/pull/5681 + + [SPARK-5894] [ML] Add polynomial mapper + Xusen Yin , Xiangrui Meng + 2015-04-24 00:39:29 -0700 + Commit: 8509519, github.com/apache/spark/pull/5245 + + Fixed a typo from the previous commit. + Reynold Xin + 2015-04-23 22:39:00 -0700 + Commit: 4c722d7 + + [SQL] Fixed expression data type matching. + Reynold Xin + 2015-04-23 21:21:03 -0700 + Commit: d3a302d, github.com/apache/spark/pull/5675 + + Update sql-programming-guide.md + Ken Geis + 2015-04-23 20:45:33 -0700 + Commit: 67bccbd, github.com/apache/spark/pull/5674 + + [SPARK-7060][SQL] Add alias function to python dataframe + Yin Huai + 2015-04-23 18:52:55 -0700 + Commit: 2d010f7, github.com/apache/spark/pull/5634 + + [SPARK-7037] [CORE] Inconsistent behavior for non-spark config properties in spark-shell and spark-submit + Cheolsoo Park + 2015-04-23 20:10:55 -0400 + Commit: 336f7f5, github.com/apache/spark/pull/5617 + + [SPARK-6818] [SPARKR] Support column deletion in SparkR DataFrame API. + Sun Rui + 2015-04-23 16:08:14 -0700 + Commit: 73db132, github.com/apache/spark/pull/5655 + + [SQL] Break dataTypes.scala into multiple files. + Reynold Xin + 2015-04-23 14:48:19 -0700 + Commit: 6220d93, github.com/apache/spark/pull/5670 + + [SPARK-7070] [MLLIB] LDA.setBeta should call setTopicConcentration. + Xiangrui Meng + 2015-04-23 14:46:54 -0700 + Commit: 1ed46a6, github.com/apache/spark/pull/5649 + + [SPARK-7087] [BUILD] Fix path issue change version script + Tijo Thomas + 2015-04-23 17:23:15 -0400 + Commit: 6d0749c, github.com/apache/spark/pull/5656 + + [SPARK-6879] [HISTORYSERVER] check if app is completed before clean it up + WangTaoTheTonic + 2015-04-23 17:20:17 -0400 + Commit: baa83a9, github.com/apache/spark/pull/5491 + + [SPARK-7085][MLlib] Fix miniBatchFraction parameter in train method called with 4 arguments + wizz + 2015-04-23 14:00:07 -0700 + Commit: 3e91cc2, github.com/apache/spark/pull/5658 + + [SPARK-7058] Include RDD deserialization time in "task deserialization time" metric + Josh Rosen + 2015-04-23 13:19:03 -0700 + Commit: 6afde2c, github.com/apache/spark/pull/5635 + + [SPARK-7055][SQL]Use correct ClassLoader for JDBC Driver in JDBCRDD.getConnector + Vinod K C + 2015-04-23 12:00:23 -0700 + Commit: c1213e6, github.com/apache/spark/pull/5633 + + [SPARK-6752][Streaming] Allow StreamingContext to be recreated from checkpoint and existing SparkContext + Tathagata Das + 2015-04-23 11:29:34 -0700 + Commit: 534f2a4, github.com/apache/spark/pull/5428 + + [SPARK-7044] [SQL] Fix the deadlock in script transformation + Cheng Hao + 2015-04-23 10:35:22 -0700 + Commit: cc48e63, github.com/apache/spark/pull/5625 + + [minor][streaming]fixed scala string interpolation error + Prabeesh K + 2015-04-23 10:33:13 -0700 + Commit: 975f53e, github.com/apache/spark/pull/5653 + + [HOTFIX] [SQL] Fix compilation for scala 2.11. 
+ Prashant Sharma + 2015-04-23 16:45:26 +0530 + Commit: a7d65d3, github.com/apache/spark/pull/5652 + + [SPARK-7069][SQL] Rename NativeType -> AtomicType. + Reynold Xin + 2015-04-23 01:43:40 -0700 + Commit: f60bece, github.com/apache/spark/pull/5651 + + [SPARK-7068][SQL] Remove PrimitiveType + Reynold Xin + 2015-04-22 23:55:20 -0700 + Commit: 29163c5, github.com/apache/spark/pull/5646 + + [MLlib] Add support for BooleanType to VectorAssembler. + Reynold Xin + 2015-04-22 23:54:48 -0700 + Commit: 2d33323, github.com/apache/spark/pull/5648 + + [HOTFIX][SQL] Fix broken cached test + Liang-Chi Hsieh + 2015-04-22 22:18:56 -0700 + Commit: d9e70f3, github.com/apache/spark/pull/5640 + + [SPARK-7046] Remove InputMetrics from BlockResult + Kay Ousterhout + 2015-04-22 21:42:09 -0700 + Commit: 03e85b4, github.com/apache/spark/pull/5627 + + [SPARK-7066][MLlib] VectorAssembler should use NumericType not NativeType. + Reynold Xin + 2015-04-22 21:35:42 -0700 + Commit: d206860, github.com/apache/spark/pull/5642 + + [MLlib] UnaryTransformer nullability should not depend on PrimitiveType. + Reynold Xin + 2015-04-22 21:35:12 -0700 + Commit: 1b85e08, github.com/apache/spark/pull/5644 + + Disable flaky test: ReceiverSuite "block generator throttling". + Reynold Xin + 2015-04-22 21:24:22 -0700 + Commit: b69c4f9 + + [SPARK-6967] [SQL] fix date type convertion in jdbcrdd + Daoyuan Wang + 2015-04-22 19:14:28 -0700 + Commit: 04525c0, github.com/apache/spark/pull/5590 + + [SPARK-6827] [MLLIB] Wrap FPGrowthModel.freqItemsets and make it consistent with Java API + Yanbo Liang + 2015-04-22 17:22:26 -0700 + Commit: f4f3998, github.com/apache/spark/pull/5614 + + [SPARK-7059][SQL] Create a DataFrame join API to facilitate equijoin. + Reynold Xin + 2015-04-22 15:26:58 -0700 + Commit: baf865d, github.com/apache/spark/pull/5638 + + [SPARK-7039][SQL]JDBCRDD: Add support on type NVARCHAR + szheng79 + 2015-04-22 13:02:55 -0700 + Commit: fbe7106, github.com/apache/spark/pull/5618 + + [SQL] Rename some apply functions. + Reynold Xin + 2015-04-22 11:18:01 -0700 + Commit: cdf0328, github.com/apache/spark/pull/5624 + + [SPARK-7052][Core] Add ThreadUtils and move thread methods from Utils to ThreadUtils + zsxwing + 2015-04-22 11:08:59 -0700 + Commit: 33b8562, github.com/apache/spark/pull/5631 + + [SPARK-6889] [DOCS] CONTRIBUTING.md updates to accompany contribution doc updates + Sean Owen + 2015-04-21 22:34:31 -0700 + Commit: bdc5c16, github.com/apache/spark/pull/5623 + + [SPARK-6113] [ML] Small cleanups after original tree API PR + Joseph K. Bradley + 2015-04-21 21:44:44 -0700 + Commit: 607eff0, github.com/apache/spark/pull/5567 + + [MINOR] Comment improvements in ExternalSorter. 
+ Patrick Wendell + 2015-04-21 21:04:04 -0700 + Commit: 70f9f8f, github.com/apache/spark/pull/5620 + + [SPARK-6490][Docs] Add docs for rpc configurations + zsxwing + 2015-04-21 18:37:53 -0700 + Commit: 3a3f710, github.com/apache/spark/pull/5607 + + [SPARK-1684] [PROJECT INFRA] Merge script should standardize SPARK-XXX prefix + texasmichelle + 2015-04-21 18:08:29 -0700 + Commit: a0761ec, github.com/apache/spark/pull/5149 + + Closes #5427 + Reynold Xin + 2015-04-21 17:52:52 -0700 + Commit: 41ef78a + + [SPARK-6953] [PySpark] speed up python tests + Reynold Xin , Xiangrui Meng + 2015-04-21 17:49:55 -0700 + Commit: 3134c3f, github.com/apache/spark/pull/5605 + + [SPARK-6014] [core] Revamp Spark shutdown hooks, fix shutdown races. + Marcelo Vanzin + 2015-04-21 20:33:57 -0400 + Commit: e72c16e, github.com/apache/spark/pull/5560 + + Avoid warning message about invalid refuse_seconds value in Mesos >=0.21... + mweindel + 2015-04-21 20:19:33 -0400 + Commit: b063a61, github.com/apache/spark/pull/5597 + + [Minor][MLLIB] Fix a minor formatting bug in toString method in Node.scala + Alain + 2015-04-21 16:46:17 -0700 + Commit: ae036d0, github.com/apache/spark/pull/5621 + + [SPARK-7036][MLLIB] ALS.train should support DataFrames in PySpark + Xiangrui Meng + 2015-04-21 16:44:52 -0700 + Commit: 686dd74, github.com/apache/spark/pull/5619 + + [SPARK-6065] [MLlib] Optimize word2vec.findSynonyms using blas calls + MechCoder + 2015-04-21 16:42:45 -0700 + Commit: 7fe6142, github.com/apache/spark/pull/5467 + + [minor] [build] Set java options when generating mima ignores. + Marcelo Vanzin + 2015-04-21 16:35:37 -0700 + Commit: a70e849, github.com/apache/spark/pull/5615 + + [SPARK-3386] Share and reuse SerializerInstances in shuffle paths + Josh Rosen + 2015-04-21 16:24:15 -0700 + Commit: f83c0f1, github.com/apache/spark/pull/5606 + + [SPARK-5817] [SQL] Fix bug of udtf with column names + Cheng Hao + 2015-04-21 15:11:15 -0700 + Commit: 7662ec2, github.com/apache/spark/pull/4602 + + [SPARK-6996][SQL] Support map types in java beans + Punya Biswal + 2015-04-21 14:50:02 -0700 + Commit: 2a24bf9, github.com/apache/spark/pull/5578 + + [SPARK-6969][SQL] Refresh the cached table when REFRESH TABLE is used + Yin Huai + 2015-04-21 14:48:42 -0700 + Commit: 6265cba, github.com/apache/spark/pull/5583 + + [SQL][minor] make it more clear that we only need to re-throw GetField exception for UnresolvedAttribute + Wenchen Fan + 2015-04-21 14:48:02 -0700 + Commit: 03fd921, github.com/apache/spark/pull/5588 + + [SPARK-6994] Allow to fetch field values by name in sql.Row + vidmantas zemleris + 2015-04-21 14:47:09 -0700 + Commit: 2e8c6ca, github.com/apache/spark/pull/5573 + + [SPARK-7011] Build(compilation) fails with scala 2.11 option, because a protected[sql] type is accessed in ml package. 
+ Prashant Sharma + 2015-04-21 14:43:46 -0700 + Commit: 04bf34e, github.com/apache/spark/pull/5593 + + [SPARK-6845] [MLlib] [PySpark] Add isTranposed flag to DenseMatrix + MechCoder + 2015-04-21 14:36:50 -0700 + Commit: 45c47fa, github.com/apache/spark/pull/5455 + + SPARK-3276 Added a new configuration spark.streaming.minRememberDuration + emres + 2015-04-21 16:39:56 -0400 + Commit: c25ca7c, github.com/apache/spark/pull/5438 + + [SPARK-5360] [SPARK-6606] Eliminate duplicate objects in serialized CoGroupedRDD + Kay Ousterhout + 2015-04-21 11:01:18 -0700 + Commit: c035c0f, github.com/apache/spark/pull/4145 + + [SPARK-6985][streaming] Receiver maxRate over 1000 causes a StackOverflowError + David McGuire + 2015-04-21 07:21:10 -0400 + Commit: 5fea3e5, github.com/apache/spark/pull/5559 + + [SPARK-5990] [MLLIB] Model import/export for IsotonicRegression + Yanbo Liang + 2015-04-21 00:14:16 -0700 + Commit: 1f2f723, github.com/apache/spark/pull/5270 + + [SPARK-6949] [SQL] [PySpark] Support Date/Timestamp in Column expression + Davies Liu + 2015-04-21 00:08:18 -0700 + Commit: ab9128f, github.com/apache/spark/pull/5570 + + [SPARK-6490][Core] Add spark.rpc.* and deprecate spark.akka.* + zsxwing + 2015-04-20 23:18:42 -0700 + Commit: 8136810, github.com/apache/spark/pull/5595 + + [SPARK-6635][SQL] DataFrame.withColumn should replace columns with identical column names + Liang-Chi Hsieh + 2015-04-20 18:54:01 -0700 + Commit: c736220, github.com/apache/spark/pull/5541 + + [SPARK-6368][SQL] Build a specialized serializer for Exchange operator. + Yin Huai + 2015-04-20 18:42:50 -0700 + Commit: ce7ddab, github.com/apache/spark/pull/5497 + + [doc][streaming] Fixed broken link in mllib section + BenFradet + 2015-04-20 13:46:55 -0700 + Commit: 517bdf3, github.com/apache/spark/pull/5600 + + fixed doc + Eric Chiang + 2015-04-20 13:11:21 -0700 + Commit: 97fda73, github.com/apache/spark/pull/5599 + + [Minor][MLlib] Incorrect path to test data is used in DecisionTreeExample + Liang-Chi Hsieh + 2015-04-20 10:47:37 -0700 + Commit: 1ebceaa, github.com/apache/spark/pull/5594 + + [SPARK-6661] Python type errors should print type, not object + Elisey Zanko + 2015-04-20 10:44:09 -0700 + Commit: 7717661, github.com/apache/spark/pull/5361 + + [SPARK-7003] Improve reliability of connection failure detection between Netty block transfer service endpoints + Aaron Davidson + 2015-04-20 09:54:21 -0700 + Commit: 968ad97, github.com/apache/spark/pull/5584 + + [SPARK-5924] Add the ability to specify withMean or withStd parameters with StandarScaler + jrabary + 2015-04-20 09:47:56 -0700 + Commit: 1be2070, github.com/apache/spark/pull/4704 + + [doc][mllib] Fix typo of the page title in Isotonic regression documents + dobashim + 2015-04-20 00:03:23 -0400 + Commit: 6fe690d, github.com/apache/spark/pull/5581 + + [SPARK-6979][Streaming] Replace JobScheduler.eventActor and JobGenerator.eventActor with EventLoop + zsxwing + 2015-04-19 20:48:36 -0700 + Commit: c776ee8, github.com/apache/spark/pull/5554 + + [SPARK-6983][Streaming] Update ReceiverTrackerActor to use the new Rpc interface + zsxwing + 2015-04-19 20:35:43 -0700 + Commit: d8e1b7b, github.com/apache/spark/pull/5557 + + [SPARK-6998][MLlib] Make StreamingKMeans 'Serializable' + zsxwing + 2015-04-19 20:33:51 -0700 + Commit: 
fa73da0, github.com/apache/spark/pull/5582 + + [SPARK-6963][CORE]Flaky test: o.a.s.ContextCleanerSuite automatically cleanup checkpoint + GuoQiang Li + 2015-04-19 09:37:09 +0100 + Commit: 0424da6, github.com/apache/spark/pull/5548 + + SPARK-6993 : Add default min, max methods for JavaDoubleRDD + Olivier Girardot + 2015-04-18 18:21:44 -0700 + Commit: 8fbd45c, github.com/apache/spark/pull/5571 + + Fixed doc + Gaurav Nanda + 2015-04-18 17:20:46 -0700 + Commit: 729885e, github.com/apache/spark/pull/5576 + + [SPARK-6219] Reuse pep8.py + Nicholas Chammas + 2015-04-18 16:46:28 -0700 + Commit: 28683b4, github.com/apache/spark/pull/5561 + + [core] [minor] Make sure ConnectionManager stops. + Marcelo Vanzin + 2015-04-18 10:14:56 +0100 + Commit: 327ebf0, github.com/apache/spark/pull/5566 + + SPARK-6992 : Fix documentation example for Spark SQL on StructType + Olivier Girardot + 2015-04-18 00:31:01 -0700 + Commit: 5f095d5, github.com/apache/spark/pull/5569 + + [SPARK-6975][Yarn] Fix argument validation error + jerryshao + 2015-04-17 19:17:06 -0700 + Commit: d850b4b, github.com/apache/spark/pull/5551 + + [SPARK-5933] [core] Move config deprecation warnings to SparkConf. + Marcelo Vanzin + 2015-04-17 19:02:07 -0700 + Commit: 1991337, github.com/apache/spark/pull/5562 + + [SPARK-6350][Mesos] Make mesosExecutorCores configurable in mesos "fine-grained" mode + Jongyoul Lee + 2015-04-17 18:30:55 -0700 + Commit: 6fbeb82, github.com/apache/spark/pull/5063 + + [SPARK-6703][Core] Provide a way to discover existing SparkContext's + Ilya Ganelin + 2015-04-17 18:28:42 -0700 + Commit: c5ed510, github.com/apache/spark/pull/5501 + + Minor fix to SPARK-6958: Improve Python docstring for DataFrame.sort. + Reynold Xin + 2015-04-17 16:30:13 -0500 + Commit: a452c59, github.com/apache/spark/pull/5558 + + SPARK-6988 : Fix documentation regarding DataFrames using the Java API + Olivier Girardot + 2015-04-17 16:23:10 -0500 + Commit: d305e68, github.com/apache/spark/pull/5564 + + [SPARK-6807] [SparkR] Merge recent SparkR-pkg changes + cafreeman , Davies Liu , Zongheng Yang , Shivaram Venkataraman , Shivaram Venkataraman , Sun Rui + 2015-04-17 13:42:19 -0700 + Commit: 59e206d, github.com/apache/spark/pull/5436 + + [SPARK-6113] [ml] Stabilize DecisionTree API + Joseph K. Bradley + 2015-04-17 13:15:36 -0700 + Commit: a83571a, github.com/apache/spark/pull/5530 + + [SPARK-2669] [yarn] Distribute client configuration to AM. + Marcelo Vanzin + 2015-04-17 14:21:51 -0500 + Commit: 50ab8a6, github.com/apache/spark/pull/4142 + + [SPARK-6957] [SPARK-6958] [SQL] improve API compatibility to pandas + Davies Liu + 2015-04-17 11:29:27 -0500 + Commit: c84d916, github.com/apache/spark/pull/5544 + + [SPARK-6604][PySpark]Specify ip of python server scoket + linweizhong + 2015-04-17 12:04:02 +0100 + Commit: dc48ba9, github.com/apache/spark/pull/5256 + + [SPARK-6952] Handle long args when detecting PID reuse + Punya Biswal + 2015-04-17 11:08:37 +0100 + Commit: f6a9a57, github.com/apache/spark/pull/5535 + + [SPARK-6046] [core] Reorganize deprecated config support in SparkConf. 
+ Marcelo Vanzin + 2015-04-17 11:06:01 +0100 + Commit: 4527761, github.com/apache/spark/pull/5514 + + SPARK-6846 [WEBUI] Stage kill URL easy to accidentally trigger and possibility for security issue + Sean Owen + 2015-04-17 11:02:31 +0100 + Commit: f7a2564, github.com/apache/spark/pull/5528 + + [SPARK-6972][SQL] Add Coalesce to DataFrame + Michael Armbrust + 2015-04-16 21:49:26 -0500 + Commit: 8220d52, github.com/apache/spark/pull/5545 + + [SPARK-6966][SQL] Use correct ClassLoader for JDBC Driver + Michael Armbrust + 2015-04-16 17:59:49 -0700 + Commit: e5949c2, github.com/apache/spark/pull/5543 + + [SPARK-6899][SQL] Fix type mismatch when using codegen with Average on DecimalType + Liang-Chi Hsieh + 2015-04-16 17:50:20 -0700 + Commit: 1e43851, github.com/apache/spark/pull/5517 + + [SQL][Minor] Fix foreachUp of treenode + scwf , Fei Wang + 2015-04-16 17:35:51 -0700 + Commit: d966086, github.com/apache/spark/pull/5518 + + [SPARK-6911] [SQL] improve accessor for nested types + Davies Liu + 2015-04-16 17:33:57 -0700 + Commit: 6183b5e, github.com/apache/spark/pull/5513 + + SPARK-6927 [SQL] Sorting Error when codegen on + 云峤 + 2015-04-16 17:32:42 -0700 + Commit: 5fe4343, github.com/apache/spark/pull/5524 + + [SPARK-4897] [PySpark] Python 3 support + Davies Liu , twneale , Josh Rosen + 2015-04-16 16:20:57 -0700 + Commit: 04e44b3, github.com/apache/spark/pull/5173 + + [SPARK-6855] [SPARKR] Set R includes to get the right collate order. + Shivaram Venkataraman + 2015-04-16 13:06:34 -0700 + Commit: 55f553a, github.com/apache/spark/pull/5462 + + [SPARK-6934][Core] Use 'spark.akka.askTimeout' for the ask timeout + zsxwing + 2015-04-16 13:45:55 -0500 + Commit: ef3fb80, github.com/apache/spark/pull/5529 + + [SPARK-6694][SQL]SparkSQL CLI must be able to specify an option --database on the command line. + Jin Adachi , adachij + 2015-04-16 23:41:04 +0800 + Commit: 3ae37b9, github.com/apache/spark/pull/5345 + + [SPARK-4194] [core] Make SparkContext initialization exception-safe. + Marcelo Vanzin + 2015-04-16 10:48:31 +0100 + Commit: de4fa6b, github.com/apache/spark/pull/5335 + + SPARK-4783 [CORE] System.exit() calls in SparkContext disrupt applications embedding Spark + Sean Owen + 2015-04-16 10:45:32 +0100 + Commit: 6179a94, github.com/apache/spark/pull/5492 + + [Streaming][minor] Remove additional quote and unneeded imports + jerryshao + 2015-04-16 10:39:02 +0100 + Commit: 8370550, github.com/apache/spark/pull/5540 + + [SPARK-6893][ML] default pipeline parameter handling in python + Xiangrui Meng + 2015-04-15 23:49:42 -0700 + Commit: 57cd1e8, github.com/apache/spark/pull/5534 + + SPARK-6938: All require statements now have an informative error message. 
+ Juliet Hougland + 2015-04-15 21:52:25 -0700 + Commit: 52c3439, github.com/apache/spark/pull/5532 + + [SPARK-5277][SQL] - SparkSqlSerializer doesn't always register user specified KryoRegistrators + Max Seiden + 2015-04-15 16:15:11 -0700 + Commit: 8a53de1, github.com/apache/spark/pull/5237 + + [SPARK-2312] Logging Unhandled messages + Isaias Barroso + 2015-04-15 22:40:52 +0100 + Commit: d5f1b96, github.com/apache/spark/pull/2055 + + [SPARK-2213] [SQL] sort merge join for spark sql + Daoyuan Wang , Michael Armbrust + 2015-04-15 14:06:10 -0700 + Commit: 585638e, github.com/apache/spark/pull/5208 + + [SPARK-6898][SQL] completely support special chars in column names + Wenchen Fan + 2015-04-15 13:39:12 -0700 + Commit: 4754e16, github.com/apache/spark/pull/5511 + + [SPARK-6937][MLLIB] Fixed bug in PICExample in which the radius were not being accepted on c... + sboeschhuawei + 2015-04-15 13:28:10 -0700 + Commit: 557a797, github.com/apache/spark/pull/5531 + + [SPARK-6844][SQL] Clean up accumulators used in InMemoryRelation when it is uncached + Liang-Chi Hsieh + 2015-04-15 13:15:58 -0700 + Commit: cf38fe0, github.com/apache/spark/pull/5475 + + [SPARK-6638] [SQL] Improve performance of StringType in SQL + Davies Liu + 2015-04-15 13:06:38 -0700 + Commit: 8584276, github.com/apache/spark/pull/5350 + + [SPARK-6887][SQL] ColumnBuilder misses FloatType + Yin Huai + 2015-04-15 13:04:03 -0700 + Commit: 785f955, github.com/apache/spark/pull/5499 + + [SPARK-6800][SQL] Update doc for JDBCRelation's columnPartition + Liang-Chi Hsieh + 2015-04-15 13:01:29 -0700 + Commit: e3e4e9a, github.com/apache/spark/pull/5488 + + [SPARK-6730][SQL] Allow using keyword as identifier in OPTIONS + Liang-Chi Hsieh + 2015-04-15 13:00:19 -0700 + Commit: b75b307, github.com/apache/spark/pull/5520 + + [SPARK-6886] [PySpark] fix big closure with shuffle + Davies Liu + 2015-04-15 12:58:02 -0700 + Commit: f11288d, github.com/apache/spark/pull/5496 + + SPARK-6861 [BUILD] Scalastyle config prevents building Maven child modules alone + Sean Owen + 2015-04-15 15:17:58 +0100 + Commit: 6c5ed8a, github.com/apache/spark/pull/5471 + + [HOTFIX] [SPARK-6896] [SQL] fix compile error in hive-thriftserver + Daoyuan Wang + 2015-04-15 10:23:53 +0100 + Commit: 29aabdd, github.com/apache/spark/pull/5507 + + [SPARK-6871][SQL] WITH clause in CTE can not following another WITH clause + Liang-Chi Hsieh + 2015-04-14 23:47:16 -0700 + Commit: 6be9189, github.com/apache/spark/pull/5480 + + [SPARK-5634] [core] Show correct message in HS when no incomplete apps f... + Marcelo Vanzin + 2015-04-14 18:52:48 -0700 + Commit: 30a6e0d, github.com/apache/spark/pull/5515 + + [SPARK-6890] [core] Fix launcher lib work with SPARK_PREPEND_CLASSES. 
+ Marcelo Vanzin + 2015-04-14 18:51:39 -0700 + Commit: 9717389, github.com/apache/spark/pull/5504 + + [SPARK-6796][Streaming][WebUI] Add "Active Batches" and "Completed Batches" lists to StreamingPage + zsxwing + 2015-04-14 16:51:36 -0700 + Commit: 6de282e, github.com/apache/spark/pull/5434 + + Revert "[SPARK-6352] [SQL] Add DirectParquetOutputCommitter" + Josh Rosen + 2015-04-14 14:07:25 -0700 + Commit: a76b921 + + [SPARK-6769][YARN][TEST] Usage of the ListenerBus in YarnClusterSuite is wrong + Kousuke Saruta + 2015-04-14 14:00:49 -0700 + Commit: 4d4b249, github.com/apache/spark/pull/5417 + + [SPARK-5808] [build] Package pyspark files in sbt assembly. + Marcelo Vanzin + 2015-04-14 13:41:38 -0700 + Commit: 6577437, github.com/apache/spark/pull/5461 + + [SPARK-6905] Upgrade to snappy-java 1.1.1.7 + Josh Rosen + 2015-04-14 13:40:07 -0700 + Commit: 6adb8bc, github.com/apache/spark/pull/5512 + + [SPARK-6700] [yarn] Re-enable flaky test. + Marcelo Vanzin + 2015-04-14 13:34:44 -0700 + Commit: b075e4b, github.com/apache/spark/pull/5459 + + SPARK-1706: Allow multiple executors per worker in Standalone mode + CodingCat + 2015-04-14 13:32:06 -0700 + Commit: 8f8dc45, github.com/apache/spark/pull/731 + + [SPARK-2033] Automatically cleanup checkpoint + GuoQiang Li + 2015-04-14 12:56:47 -0700 + Commit: 25998e4, github.com/apache/spark/pull/855 + + [CORE] SPARK-6880: Fixed null check when all the dependent stages are cancelled due to previous stage failure + pankaj arora + 2015-04-14 12:06:46 -0700 + Commit: dcf8a9f, github.com/apache/spark/pull/5494 + + [SPARK-6894]spark.executor.extraLibraryOptions => spark.executor.extraLibraryPath + WangTaoTheTonic + 2015-04-14 12:02:11 -0700 + Commit: f63b44a, github.com/apache/spark/pull/5506 + + [SPARK-6081] Support fetching http/https uris in driver runner. + Timothy Chen + 2015-04-14 11:48:12 -0700 + Commit: 320bca4, github.com/apache/spark/pull/4832 + + SPARK-6878 [CORE] Fix for sum on empty RDD fails with exception + Erik van Oosten + 2015-04-14 12:39:56 +0100 + Commit: 51b306b, github.com/apache/spark/pull/5489 + + [SPARK-6731] Bump version of apache commons-math3 + Punyashloka Biswal + 2015-04-14 11:43:06 +0100 + Commit: 628a72f, github.com/apache/spark/pull/5380 + + [WIP][HOTFIX][SPARK-4123]: Fix bug in PR dependency (all deps. removed issue) + Brennon York + 2015-04-13 22:31:44 -0700 + Commit: 77eeb10, github.com/apache/spark/pull/5443 + + [SPARK-5957][ML] better handling of parameters + Xiangrui Meng + 2015-04-13 21:18:05 -0700 + Commit: 971b95b, github.com/apache/spark/pull/5431 + + [Minor][SparkR] Minor refactor and removes redundancy related to cleanClosure. 
+ hlin09 + 2015-04-13 20:43:24 -0700 + Commit: 0ba3fdd, github.com/apache/spark/pull/5495 + + [SPARK-5794] [SQL] fix add jar + Daoyuan Wang + 2015-04-13 18:26:00 -0700 + Commit: b45059d, github.com/apache/spark/pull/4586 + + [SQL] [Minor] Fix for SqlApp.scala + Fei Wang + 2015-04-13 18:23:35 -0700 + Commit: 3782e1f, github.com/apache/spark/pull/5485 + + [Spark-4848] Allow different Worker configurations in standalone cluster + Nathan Kronenfeld + 2015-04-13 18:21:16 -0700 + Commit: 435b877, github.com/apache/spark/pull/5140 + + [SPARK-6877][SQL] Add code generation support for Min + Liang-Chi Hsieh + 2015-04-13 18:16:33 -0700 + Commit: 4898dfa, github.com/apache/spark/pull/5487 + + [SPARK-6303][SQL] Remove unnecessary Average in GeneratedAggregate + Liang-Chi Hsieh + 2015-04-13 18:15:29 -0700 + Commit: 5b8b324, github.com/apache/spark/pull/4996 + + [SPARK-6881][SparkR] Changes the checkpoint directory name. + hlin09 + 2015-04-13 16:53:50 -0700 + Commit: d7f2c19, github.com/apache/spark/pull/5493 + + [SPARK-5931][CORE] Use consistent naming for time properties + Ilya Ganelin , Ilya Ganelin + 2015-04-13 16:28:07 -0700 + Commit: c4ab255, github.com/apache/spark/pull/5236 + + [SPARK-5941] [SQL] Unit Test loads the table `src` twice for leftsemijoin.q + Cheng Hao + 2015-04-13 16:02:18 -0700 + Commit: c5602bd, github.com/apache/spark/pull/4506 + + [SPARK-6872] [SQL] add copy in external sort + Daoyuan Wang + 2015-04-13 16:00:58 -0700 + Commit: e63a86a, github.com/apache/spark/pull/5481 + + [SPARK-5972] [MLlib] Cache residuals and gradient in GBT during training and validation + MechCoder + 2015-04-13 15:36:33 -0700 + Commit: 2a55cb4, github.com/apache/spark/pull/5330 + + [SQL][SPARK-6742]: Don't push down predicates which reference partition column(s) + Yash Datta + 2015-04-13 14:43:07 -0700 + Commit: 3a205bb, github.com/apache/spark/pull/5390 + + [SPARK-6130] [SQL] support if not exists for insert overwrite into partition in hiveQl + Daoyuan Wang + 2015-04-13 14:29:07 -0700 + Commit: 85ee0ca, github.com/apache/spark/pull/4865 + + [SPARK-5988][MLlib] add save/load for PowerIterationClusteringModel + Xusen Yin + 2015-04-13 11:53:17 -0700 + Commit: 1e340c3, github.com/apache/spark/pull/5450 + + [SPARK-6662][YARN] Allow variable substitution in spark.yarn.historyServer.address + Cheolsoo Park + 2015-04-13 13:45:10 -0500 + Commit: 6cc5b3e, github.com/apache/spark/pull/5321 + + [SPARK-6765] Enable scalastyle on test code. + Reynold Xin + 2015-04-13 09:29:04 -0700 + Commit: c5b0b29, github.com/apache/spark/pull/5486 + + [SPARK-6207] [YARN] [SQL] Adds delegation tokens for metastore to conf. 
+ Doug Balog , Doug Balog + 2015-04-13 09:49:58 -0500 + Commit: 77620be, github.com/apache/spark/pull/5031 + + [SPARK-6352] [SQL] Add DirectParquetOutputCommitter + Pei-Lun Lee + 2015-04-13 21:52:00 +0800 + Commit: b29663e, github.com/apache/spark/pull/5042 + + [SPARK-6870][Yarn] Catch InterruptedException when yarn application state monitor thread been interrupted + linweizhong + 2015-04-13 13:06:54 +0100 + Commit: 202ebf0, github.com/apache/spark/pull/5479 + + [SPARK-6671] Add status command for spark daemons + Pradeep Chanumolu + 2015-04-13 13:02:55 +0100 + Commit: 240ea03, github.com/apache/spark/pull/5327 + + [SPARK-6440][CORE]Handle IPv6 addresses properly when constructing URI + nyaapa + 2015-04-13 12:55:25 +0100 + Commit: 9d117ce, github.com/apache/spark/pull/5424 + + [SPARK-6860][Streaming][WebUI] Fix the possible inconsistency of StreamingPage + zsxwing + 2015-04-13 12:21:29 +0100 + Commit: 14ce3ea, github.com/apache/spark/pull/5470 + + [SPARK-6762]Fix potential resource leaks in CheckPoint CheckpointWriter and CheckpointReader + lisurprise + 2015-04-13 12:18:05 +0100 + Commit: cadd7d7, github.com/apache/spark/pull/5407 + + [SPARK-6868][YARN] Fix broken container log link on executor page when HTTPS_ONLY. + Dean Chen + 2015-04-13 12:08:55 +0100 + Commit: 950645d, github.com/apache/spark/pull/5477 + + [SPARK-6562][SQL] DataFrame.replace + Reynold Xin + 2015-04-12 22:56:12 -0700 + Commit: 68d1faa, github.com/apache/spark/pull/5282 + + [SPARK-5885][MLLIB] Add VectorAssembler as a feature transformer + Xiangrui Meng + 2015-04-12 22:42:01 -0700 + Commit: 9294044, github.com/apache/spark/pull/5196 + + [SPARK-5886][ML] Add StringIndexer as a feature transformer + Xiangrui Meng + 2015-04-12 22:41:05 -0700 + Commit: 685ddcf, github.com/apache/spark/pull/4735 + + [SPARK-4081] [mllib] VectorIndexer + Joseph K. Bradley + 2015-04-12 22:38:27 -0700 + Commit: d3792f5, github.com/apache/spark/pull/3000 + + [SPARK-6643][MLLIB] Implement StandardScalerModel missing methods + lewuathe + 2015-04-12 22:17:16 -0700 + Commit: fc17661, github.com/apache/spark/pull/5310 + + [SPARK-6765] Fix test code style for core. + Reynold Xin + 2015-04-12 20:50:49 -0700 + Commit: a1fe59d, github.com/apache/spark/pull/5484 + + [MINOR] a typo: coalesce + Daoyuan Wang + 2015-04-12 18:58:53 +0100 + Commit: 04bcd67, github.com/apache/spark/pull/5482 + + [SPARK-6431][Streaming][Kafka] Error message for partition metadata requ... + cody koeninger + 2015-04-12 17:37:30 +0100 + Commit: 6ac8eea, github.com/apache/spark/pull/5454 + + [SPARK-6843][core]Add volatile for the "state" + lisurprise + 2015-04-12 13:41:44 +0100 + Commit: ddc1743, github.com/apache/spark/pull/5448 + + [SPARK-6866][Build] Remove duplicated dependency in launcher/pom.xml + Guancheng (G.C.) Chen + 2015-04-12 11:36:41 +0100 + Commit: e9445b1, github.com/apache/spark/pull/5476 + + [SPARK-6677] [SQL] [PySpark] fix cached classes + Davies Liu + 2015-04-11 22:33:23 -0700 + Commit: 5d8f7b9, github.com/apache/spark/pull/5445 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2015-04-11 22:12:56 -0700 + Commit: 0cc8fcb, github.com/apache/spark/pull/4994 + + SPARK-6710 GraphX Fixed Wrong initial bias in GraphX SVDPlusPlus + Michael Malak + 2015-04-11 21:01:23 -0700 + Commit: 1205f7e, github.com/apache/spark/pull/5464 + + [HOTFIX] Add explicit return types to fix lint errors + Josh Rosen + 2015-04-11 20:12:40 -0700 + Commit: dea5dac + + [SQL][minor] move `resolveGetField` into a object + Wenchen Fan + 2015-04-11 19:35:56 -0700 + Commit: 5c2844c, github.com/apache/spark/pull/5435 + + [SPARK-6367][SQL] Use the proper data type for those expressions that are hijacking existing data types. + Yin Huai + 2015-04-11 19:26:15 -0700 + Commit: 6d4e854, github.com/apache/spark/pull/5094 + + [SQL] Handle special characters in the authority of a Path's URI. + Yin Huai + 2015-04-11 18:44:54 -0700 + Commit: d2383fb, github.com/apache/spark/pull/5381 + + [SPARK-6379][SQL] Support a functon to call user-defined functions registered in SQLContext + Takeshi YAMAMURO + 2015-04-11 18:41:12 -0700 + Commit: 352a5da, github.com/apache/spark/pull/5061 + + [SPARK-6179][SQL] Add token for "SHOW PRINCIPALS role_name" and "SHOW TRANSACTIONS" and "SHOW COMPACTIONS" + DoingDone9 <799203320@qq.com>, Zhongshuai Pei <799203320@qq.com>, Xu Tingjun + 2015-04-11 18:34:17 -0700 + Commit: 48cc840, github.com/apache/spark/pull/4902 + + [Spark-5068][SQL]Fix bug query data when path doesn't exist for HiveContext + lazymam500 , lazyman + 2015-04-11 18:33:14 -0700 + Commit: 1f39a61, github.com/apache/spark/pull/5059 + + [SPARK-6199] [SQL] Support CTE in HiveContext and SQLContext + haiyang + 2015-04-11 18:30:17 -0700 + Commit: 2f53588, github.com/apache/spark/pull/4929 + + [Minor][SQL] Fix typo in sql + Guancheng (G.C.) Chen + 2015-04-11 15:43:12 -0700 + Commit: 7dbd371, github.com/apache/spark/pull/5474 + + [SPARK-6863] Fix formatting on SQL programming guide. + Santiago M. Mola + 2015-04-11 15:42:03 -0700 + Commit: 6437e7c, github.com/apache/spark/pull/5472 + + [SPARK-6611][SQL] Add support for INTEGER as synonym of INT. + Santiago M. Mola + 2015-04-11 14:52:49 -0700 + Commit: 5f7b7cd, github.com/apache/spark/pull/5271 + + [SPARK-6858][SQL] Register Java HashMap for SparkSqlSerializer + Liang-Chi Hsieh + 2015-04-11 14:50:50 -0700 + Commit: 198cf2a, github.com/apache/spark/pull/5465 + + [SPARK-6835] [SQL] Fix bug of Hive UDTF in Lateral View (ClassNotFound) + Cheng Hao + 2015-04-11 22:11:03 +0800 + Commit: 3ceb810, github.com/apache/spark/pull/5444 + + [hotfix] [build] Make sure JAVA_HOME is set for tests. 
+ Marcelo Vanzin + 2015-04-11 13:10:01 +0100 + Commit: 694aef0, github.com/apache/spark/pull/5441 + + [Minor][Core] Fix typo + Liang-Chi Hsieh + 2015-04-11 13:07:41 +0100 + Commit: 95a0759, github.com/apache/spark/pull/5466 + + [SQL] [SPARK-6620] Speed up toDF() and rdd() functions by constructing converters in ScalaReflection + Volodymyr Lyubinets + 2015-04-10 16:27:56 -0700 + Commit: 67d0688, github.com/apache/spark/pull/5279 + + [SPARK-6851][SQL] Create new instance for each converted parquet relation + Michael Armbrust + 2015-04-10 16:05:14 -0700 + Commit: 23d5f88, github.com/apache/spark/pull/5458 + + [SPARK-6850] [SparkR] use one partition when we need to compare the whole result + Davies Liu + 2015-04-10 15:35:45 -0700 + Commit: 68ecdb7, github.com/apache/spark/pull/5460 + + [SPARK-6216] [PySpark] check the python version in worker + Davies Liu + 2015-04-10 14:04:53 -0700 + Commit: 4740d6a, github.com/apache/spark/pull/5404 + + [SPARK-5969][PySpark] Fix descending pyspark.rdd.sortByKey. + Milan Straka + 2015-04-10 13:50:32 -0700 + Commit: 0375134, github.com/apache/spark/pull/4761 + + [SQL] [SPARK-6794] Use kryo-based SparkSqlSerializer for GeneralHashedRelation + Volodymyr Lyubinets + 2015-04-10 12:09:54 -0700 + Commit: b9baa4c, github.com/apache/spark/pull/5433 + + [SPARK-6773][Tests]Fix RAT checks still passed issue when download rat jar failed + June.He + 2015-04-10 20:02:35 +0100 + Commit: 9f5ed99, github.com/apache/spark/pull/5421 + + [SPARK-6766][Streaming] Fix issue about StreamingListenerBatchSubmitted and StreamingListenerBatchStarted + zsxwing + 2015-04-10 01:51:42 -0700 + Commit: 18ca089, github.com/apache/spark/pull/5414 + + [SPARK-6211][Streaming] Add Python Kafka API unit test + jerryshao , Saisai Shao + 2015-04-09 23:14:24 -0700 + Commit: 3290d2d, github.com/apache/spark/pull/4961 + + [SPARK-6577] [MLlib] [PySpark] SparseMatrix should be supported in PySpark + MechCoder + 2015-04-09 23:10:13 -0700 + Commit: e236081, github.com/apache/spark/pull/5355 + + [SPARK-3074] [PySpark] support groupByKey() with single huge key + Davies Liu , Davies Liu + 2015-04-09 17:07:23 -0700 + Commit: b5c51c8, github.com/apache/spark/pull/1977 + + [Spark-6693][MLlib]add tostring with max lines and width for matrix + Yuhao Yang + 2015-04-09 15:37:45 -0700 + Commit: 9c67049, github.com/apache/spark/pull/5344 + + [SPARK-6264] [MLLIB] Support FPGrowth algorithm in Python API + Yanbo Liang + 2015-04-09 15:10:10 -0700 + Commit: a0411ae, github.com/apache/spark/pull/5213 + + [SPARK-6758]block the right jetty package in log + WangTaoTheTonic + 2015-04-09 17:44:08 -0400 + Commit: 7d92db3, github.com/apache/spark/pull/5406 + + [minor] [examples] Avoid packaging duplicate classes. + Marcelo Vanzin + 2015-04-09 07:07:50 -0400 + Commit: 470d745, github.com/apache/spark/pull/5379 + + SPARK-4924 addendum. 
Minor assembly directory fix in load-spark-env-sh + raschild + 2015-04-09 07:04:18 -0400 + Commit: 53f6bb1, github.com/apache/spark/pull/5261 + + [SPARK-6343] Doc driver-worker network reqs + Peter Parente + 2015-04-09 06:37:20 -0400 + Commit: b9c51c0, github.com/apache/spark/pull/5382 + + [SPARK-5654] Integrate SparkR + Shivaram Venkataraman , Shivaram Venkataraman , Zongheng Yang , cafreeman , Shivaram Venkataraman , Davies Liu , Davies Liu , hlin09 , Sun Rui , lythesia , oscaroboto , Antonio Piccolboni , root , edwardt , hqzizania , dputler , Todd Gao , Chris Freeman , Felix Cheung , Hossein , Evert Lammerts , Felix Cheung , felixcheung , Ryan Hafen , Ashutosh Raina , Oscar Olmedo , Josh Rosen , Yi Lu , Harihar Nahak + 2015-04-08 22:45:40 -0700 + Commit: 2fe0a1a, github.com/apache/spark/pull/5096 + + [SPARK-6765] Fix test code style for SQL + Reynold Xin + 2015-04-08 20:35:29 -0700 + Commit: 1b2aab8, github.com/apache/spark/pull/5412 + + [SPARK-6696] [SQL] Adds HiveContext.refreshTable to PySpark + Cheng Lian + 2015-04-08 18:47:39 -0700 + Commit: 891ada5, github.com/apache/spark/pull/5349 + + [SPARK-6451][SQL] supported code generation for CombineSum + Venkata Ramana Gollamudi + 2015-04-08 18:42:34 -0700 + Commit: 7d7384c, github.com/apache/spark/pull/5138 + + [SQL][minor] remove duplicated resolveGetField and update comment + Wenchen Fan + 2015-04-08 13:57:01 -0700 + Commit: 9418280, github.com/apache/spark/pull/5304 + + [SPARK-4346][SPARK-3596][YARN] Commonize the monitor logic + unknown , Sephiroth-Lin + 2015-04-08 13:56:42 -0700 + Commit: 55a92ef, github.com/apache/spark/pull/5305 + + [SPARK-5242]: Add --private-ips flag to EC2 script + Michelangelo D'Agostino + 2015-04-08 16:48:45 -0400 + Commit: 86403f5, github.com/apache/spark/pull/5244 + + [SPARK-6767][SQL] Fixed Query DSL error in spark sql Readme + Tijo Thomas + 2015-04-08 13:42:29 -0700 + Commit: 2f482d7, github.com/apache/spark/pull/5415 + + [SPARK-6781] [SQL] use sqlContext in python shell + Davies Liu + 2015-04-08 13:31:45 -0700 + Commit: 6ada4f6, github.com/apache/spark/pull/5425 + + [SPARK-6765] Fix test code style for mllib. + Reynold Xin + 2015-04-08 11:32:44 -0700 + Commit: 66159c3, github.com/apache/spark/pull/5411 + + [SPARK-6765] Fix test code style for graphx. + Reynold Xin + 2015-04-08 11:31:48 -0700 + Commit: 8d812f9, github.com/apache/spark/pull/5410 + + [SPARK-6753] Clone SparkConf in ShuffleSuite tests + Kay Ousterhout + 2015-04-08 10:26:45 -0700 + Commit: 9d44ddc, github.com/apache/spark/pull/5401 + + [SPARK-6506] [pyspark] Do not try to retrieve SPARK_HOME when not needed... + Marcelo Vanzin + 2015-04-08 10:14:52 -0700 + Commit: f7e21dd, github.com/apache/spark/pull/5405 + + [SPARK-6765] Fix test code style for streaming. 
+ Reynold Xin + 2015-04-08 00:24:59 -0700 + Commit: 15e0d2b, github.com/apache/spark/pull/5409 + + [SPARK-6754] Remove unnecessary TaskContextHelper + Kay Ousterhout + 2015-04-07 22:40:42 -0700 + Commit: 8d2a36c, github.com/apache/spark/pull/5402 + + [SPARK-6705][MLLIB] Add fit intercept api to ml logisticregression + Omede Firouz + 2015-04-07 23:36:31 -0400 + Commit: d138aa8, github.com/apache/spark/pull/5301 + + [SPARK-6737] Fix memory leak in OutputCommitCoordinator + Josh Rosen + 2015-04-07 16:18:55 -0700 + Commit: c83e039, github.com/apache/spark/pull/5397 + + [SPARK-6748] [SQL] Makes QueryPlan.schema a lazy val + Cheng Lian + 2015-04-08 07:00:56 +0800 + Commit: 77bcceb, github.com/apache/spark/pull/5398 + + [SPARK-6720][MLLIB] PySpark MultivariateStatisticalSummary unit test for normL1... + lewuathe + 2015-04-07 14:36:57 -0700 + Commit: fc957dc, github.com/apache/spark/pull/5374 + + Revert "[SPARK-6568] spark-shell.cmd --jars option does not accept the jar that has space in its path" + Xiangrui Meng + 2015-04-07 14:34:15 -0700 + Commit: e6f08fb + + [SPARK-6568] spark-shell.cmd --jars option does not accept the jar that has space in its path + Masayoshi TSUZUKI + 2015-04-07 14:29:53 -0700 + Commit: 596ba77, github.com/apache/spark/pull/5347 + + [SPARK-6750] Upgrade ScalaStyle to 0.7. + Reynold Xin + 2015-04-07 12:37:33 -0700 + Commit: 1232215, github.com/apache/spark/pull/5399 + + Replace use of .size with .length for Arrays + sksamuel + 2015-04-07 10:43:22 -0700 + Commit: 2c32bef, github.com/apache/spark/pull/5376 + + [SPARK-6733][ Scheduler]Added scala.language.existentials + Vinod K C + 2015-04-07 10:42:08 -0700 + Commit: 7162ecf, github.com/apache/spark/pull/5384 + + [SPARK-3591][YARN]fire and forget for YARN cluster mode + WangTaoTheTonic + 2015-04-07 08:36:25 -0500 + Commit: b65bad6, github.com/apache/spark/pull/5297 + + [SPARK-6736][GraphX][Doc]Example of Graph#aggregateMessages has error + Sasaki Toru + 2015-04-07 01:55:32 -0700 + Commit: ae980eb, github.com/apache/spark/pull/5388 + + [SPARK-6636] Use public DNS hostname everywhere in spark_ec2.py + Matt Aasted + 2015-04-06 23:50:48 -0700 + Commit: 6f0d55d, github.com/apache/spark/pull/5302 + + [SPARK-6716] Change SparkContext.DRIVER_IDENTIFIER from to driver + Josh Rosen + 2015-04-06 23:33:16 -0700 + Commit: a0846c4, github.com/apache/spark/pull/5372 + + [Minor] [SQL] [SPARK-6729] Minor fix for DriverQuirks get + Volodymyr Lyubinets + 2015-04-06 18:00:51 -0700 + Commit: e40ea87, github.com/apache/spark/pull/5378 + + [MLlib] [SPARK-6713] Iterators in columnSimilarities for mapPartitionsWithIndex + Reza Zadeh + 2015-04-06 13:15:01 -0700 + Commit: 30363ed, github.com/apache/spark/pull/5364 + + SPARK-6569 [STREAMING] Down-grade same-offset message in Kafka streaming to INFO + Sean Owen + 2015-04-06 10:18:56 +0100 + Commit: 9fe4125, github.com/apache/spark/pull/5366 + + [SPARK-6673] spark-shell.cmd can't start in Windows even when spark was built + Masayoshi TSUZUKI + 2015-04-06 10:11:20 +0100 + Commit: 49f3882, github.com/apache/spark/pull/5328 + + [SPARK-6602][Core] Update MapOutputTrackerMasterActor to MapOutputTrackerMasterEndpoint + zsxwing + 2015-04-05 21:57:15 -0700 + Commit: 0b5d028, github.com/apache/spark/pull/5371 + + [SPARK-6262][MLLIB]Implement missing 
methods for MultivariateStatisticalSummary + lewuathe + 2015-04-05 16:13:31 -0700 + Commit: acffc43, github.com/apache/spark/pull/5359 + + [SPARK-6602][Core] Replace direct use of Akka with Spark RPC interface - part 1 + zsxwing + 2015-04-04 11:52:05 -0700 + Commit: f15806a, github.com/apache/spark/pull/5268 + + [SPARK-6607][SQL] Check invalid characters for Parquet schema and show error messages + Liang-Chi Hsieh + 2015-04-05 00:20:43 +0800 + Commit: 7bca62f, github.com/apache/spark/pull/5263 + + [SQL] Use path.makeQualified in newParquet. + Yin Huai + 2015-04-04 23:26:10 +0800 + Commit: da25c86, github.com/apache/spark/pull/5353 + + [SPARK-6700] disable flaky test + Davies Liu + 2015-04-03 15:22:21 -0700 + Commit: 9b40c17, github.com/apache/spark/pull/5356 + + [SPARK-6647][SQL] Make trait StringComparison as BinaryPredicate and fix unit tests of string data source Filter + Liang-Chi Hsieh + 2015-04-03 12:35:00 -0700 + Commit: 26b415e, github.com/apache/spark/pull/5309 + + [SPARK-6688] [core] Always use resolved URIs in EventLoggingListener. + Marcelo Vanzin + 2015-04-03 11:54:31 -0700 + Commit: 14632b7, github.com/apache/spark/pull/5340 + + Closes #3158 + Reynold Xin + 2015-04-03 11:53:07 -0700 + Commit: ffe8cc9 + + [SPARK-6640][Core] Fix the race condition of creating HeartbeatReceiver and retrieving HeartbeatReceiver + zsxwing + 2015-04-03 11:44:27 -0700 + Commit: 88504b7, github.com/apache/spark/pull/5306 + + [SPARK-6492][CORE] SparkContext.stop() can deadlock when DAGSchedulerEventProcessLoop dies + Ilya Ganelin + 2015-04-03 19:23:11 +0100 + Commit: 2c43ea3, github.com/apache/spark/pull/5277 + + [SPARK-5203][SQL] fix union with different decimal type + guowei2 + 2015-04-04 02:02:30 +0800 + Commit: c23ba81, github.com/apache/spark/pull/4004 + + [Minor][SQL] Fix typo + Liang-Chi Hsieh + 2015-04-03 18:31:48 +0100 + Commit: dc6dff2, github.com/apache/spark/pull/5352 + + [SPARK-6615][MLLIB] Python API for Word2Vec + lewuathe + 2015-04-03 09:49:50 -0700 + Commit: 512a2f1, github.com/apache/spark/pull/5296 + + [MLLIB] Remove println in LogisticRegression.scala + Omede Firouz + 2015-04-03 10:26:43 +0100 + Commit: b52c7f9, github.com/apache/spark/pull/5338 + + [SPARK-6560][CORE] Do not suppress exceptions from writer.write. + Stephen Haberman + 2015-04-03 09:48:37 +0100 + Commit: b0d884f, github.com/apache/spark/pull/5223 + + [SPARK-6428] Turn on explicit type checking for public methods. 
+ Reynold Xin + 2015-04-03 01:25:02 -0700 + Commit: 82701ee, github.com/apache/spark/pull/5342 + + [SPARK-6575][SQL] Converted Parquet Metastore tables no longer cache metadata + Yin Huai + 2015-04-03 14:40:36 +0800 + Commit: c42c3fc, github.com/apache/spark/pull/5339 + + [SPARK-6621][Core] Fix the bug that calling EventLoop.stop in EventLoop.onReceive/onError/onStart doesn't call onStop + zsxwing + 2015-04-02 22:54:30 -0700 + Commit: 440ea31, github.com/apache/spark/pull/5280 + + [SPARK-6345][STREAMING][MLLIB] Fix for training with prediction + freeman + 2015-04-02 21:37:44 -0700 + Commit: 6e1c1ec, github.com/apache/spark/pull/5037 + + [CORE] The descriptionof jobHistory config should be spark.history.fs.logDirectory + KaiXinXiaoLei + 2015-04-02 20:24:31 -0700 + Commit: 8a0aa81, github.com/apache/spark/pull/5332 + + [SPARK-6575][SQL] Converted Parquet Metastore tables no longer cache metadata + Yin Huai + 2015-04-02 20:23:08 -0700 + Commit: 4b82bd7, github.com/apache/spark/pull/5339 + + [SPARK-6650] [core] Stop ExecutorAllocationManager when context stops. + Marcelo Vanzin + 2015-04-02 19:48:55 -0700 + Commit: 45134ec, github.com/apache/spark/pull/5311 + + [SPARK-6686][SQL] Use resolved output instead of names for toDF rename + Michael Armbrust + 2015-04-02 18:30:55 -0700 + Commit: 052dee0, github.com/apache/spark/pull/5337 + + [SPARK-6243][SQL] The Operation of match did not conside the scenarios that order.dataType does not match NativeType + DoingDone9 <799203320@qq.com> + 2015-04-02 17:23:51 -0700 + Commit: 947802c, github.com/apache/spark/pull/4959 + + [SQL][Minor] Use analyzed logical instead of unresolved in HiveComparisonTest + Cheng Hao + 2015-04-02 17:20:31 -0700 + Commit: dfd2982, github.com/apache/spark/pull/4946 + + [SPARK-6618][SPARK-6669][SQL] Lock Hive metastore client correctly. + Yin Huai , Michael Armbrust + 2015-04-02 16:46:50 -0700 + Commit: 5db8912, github.com/apache/spark/pull/5333 + + [Minor] [SQL] Follow-up of PR #5210 + Cheng Lian + 2015-04-02 16:15:34 -0700 + Commit: d3944b6, github.com/apache/spark/pull/5219 + + [SPARK-6655][SQL] We need to read the schema of a data source table stored in spark.sql.sources.schema property + Yin Huai + 2015-04-02 16:02:31 -0700 + Commit: 251698f, github.com/apache/spark/pull/5313 + + [SQL] Throw UnsupportedOperationException instead of NotImplementedError + Michael Armbrust + 2015-04-02 16:01:03 -0700 + Commit: 4214e50, github.com/apache/spark/pull/5315 + + SPARK-6414: Spark driver failed with NPE on job cancelation + Hung Lin + 2015-04-02 14:01:43 -0700 + Commit: e3202aa, github.com/apache/spark/pull/5124 + + [SPARK-6667] [PySpark] remove setReuseAddress + Davies Liu + 2015-04-02 12:18:33 -0700 + Commit: 0cce545, github.com/apache/spark/pull/5324 + + [SPARK-6672][SQL] convert row to catalyst in createDataFrame(RDD[Row], ...) + Xiangrui Meng + 2015-04-02 17:57:01 +0800 + Commit: 424e987, github.com/apache/spark/pull/5329 + + [SPARK-6627] Some clean-up in shuffle code. 
+ Patrick Wendell + 2015-04-01 23:42:09 -0700 + Commit: 6562787, github.com/apache/spark/pull/5286 + + [SPARK-6663] [SQL] use Literal.create instread of constructor + Davies Liu + 2015-04-01 23:11:38 -0700 + Commit: 40df5d4, github.com/apache/spark/pull/5320 + + Revert "[SPARK-6618][SQL] HiveMetastoreCatalog.lookupRelation should use fine-grained lock" + Cheng Lian + 2015-04-02 12:56:34 +0800 + Commit: 2bc7fe7 + + [SPARK-6658][SQL] Update DataFrame documentation to fix type references. + Chet Mancini + 2015-04-01 21:39:46 -0700 + Commit: 191524e, github.com/apache/spark/pull/5316 + + [SPARK-6578] Small rewrite to make the logic more clear in MessageWithHeader.transferTo. + Reynold Xin + 2015-04-01 18:36:06 -0700 + Commit: 899ebcb, github.com/apache/spark/pull/5319 + + [SPARK-6660][MLLIB] pythonToJava doesn't recognize object arrays + Xiangrui Meng + 2015-04-01 18:17:07 -0700 + Commit: 4815bc2, github.com/apache/spark/pull/5318 + + [SPARK-6553] [pyspark] Support functools.partial as UDF + ksonj + 2015-04-01 17:23:57 -0700 + Commit: 757b2e9, github.com/apache/spark/pull/5206 + + [SPARK-6580] [MLLIB] Optimize LogisticRegressionModel.predictPoint + Yanbo Liang + 2015-04-01 17:19:36 -0700 + Commit: 86b4399, github.com/apache/spark/pull/5249 + + [SPARK-6576] [MLlib] [PySpark] DenseMatrix in PySpark should support indexing + MechCoder + 2015-04-01 17:03:39 -0700 + Commit: 2fa3b47, github.com/apache/spark/pull/5232 + + [SPARK-6642][MLLIB] use 1.2 lambda scaling and remove addImplicit from NormalEquation + Xiangrui Meng + 2015-04-01 16:47:18 -0700 + Commit: ccafd75, github.com/apache/spark/pull/5314 + + [SPARK-6578] [core] Fix thread-safety issue in outbound path of network library. + Marcelo Vanzin + 2015-04-01 16:06:11 -0700 + Commit: f084c5d, github.com/apache/spark/pull/5234 + + [SPARK-6657] [Python] [Docs] fixed python doc build warnings + Joseph K. 
Bradley + 2015-04-01 15:15:47 -0700 + Commit: fb25e8c, github.com/apache/spark/pull/5317 + + [SPARK-6651][MLLIB] delegate dense vector arithmetics to the underlying numpy array + Xiangrui Meng + 2015-04-01 13:29:04 -0700 + Commit: 2275acc, github.com/apache/spark/pull/5312 + + SPARK-6433 hive tests to import spark-sql test JAR for QueryTest access + Steve Loughran + 2015-04-01 16:26:54 +0100 + Commit: ee11be2, github.com/apache/spark/pull/5119 + + [SPARK-6608] [SQL] Makes DataFrame.rdd a lazy val + Cheng Lian + 2015-04-01 21:34:45 +0800 + Commit: d36c5fc, github.com/apache/spark/pull/5265 + + SPARK-6626 [DOCS]: Corrected Scala:TwitterUtils parameters + jayson + 2015-04-01 11:12:55 +0100 + Commit: 0358b08, github.com/apache/spark/pull/5295 + + [SPARK-6597][Minor] Replace `input:checkbox` with `input[type="checkbox"]` in additional-metrics.js + Kousuke Saruta + 2015-04-01 11:11:56 +0100 + Commit: d824c11, github.com/apache/spark/pull/5254 + + [EC2] [SPARK-6600] Open ports in ec2/spark_ec2.py to allow HDFS NFS gateway + Florian Verhein + 2015-04-01 11:10:43 +0100 + Commit: 4122623, github.com/apache/spark/pull/5257 + + [SPARK-4655][Core] Split Stage into ShuffleMapStage and ResultStage subclasses + Ilya Ganelin , Ilya Ganelin + 2015-04-01 11:09:00 +0100 + Commit: ff1915e, github.com/apache/spark/pull/4708 + + [Doc] Improve Python DataFrame documentation + Reynold Xin + 2015-03-31 18:31:36 -0700 + Commit: 305abe1, github.com/apache/spark/pull/5287 + + [SPARK-6614] OutputCommitCoordinator should clear authorized committer only after authorized committer fails, not after any failure + Josh Rosen + 2015-03-31 16:18:39 -0700 + Commit: 3732607, github.com/apache/spark/pull/5276 + + [SPARK-5692] [MLlib] Word2Vec save/load + MechCoder + 2015-03-31 16:01:08 -0700 + Commit: 0e00f12, github.com/apache/spark/pull/5291 + + [SPARK-6633][SQL] Should be "Contains" instead of "EndsWith" when constructing sources.StringContains + Liang-Chi Hsieh + 2015-03-31 13:18:07 -0700 + Commit: 2036bc5, github.com/apache/spark/pull/5299 + + [SPARK-5371][SQL] Propagate types after function conversion, before futher resolution + Michael Armbrust + 2015-03-31 11:34:29 -0700 + Commit: beebb7f, github.com/apache/spark/pull/5278 + + [SPARK-6255] [MLLIB] Support multiclass classification in Python API + Yanbo Liang + 2015-03-31 11:32:14 -0700 + Commit: b5bd75d, github.com/apache/spark/pull/5137 + + [SPARK-6598][MLLIB] Python API for IDFModel + lewuathe + 2015-03-31 11:25:21 -0700 + Commit: 46de6c0, github.com/apache/spark/pull/5264 + + [SPARK-6145][SQL] fix ORDER BY on nested fields + Michael Armbrust + 2015-03-31 11:23:18 -0700 + Commit: cd48ca5, github.com/apache/spark/pull/5189 + + [SPARK-6575] [SQL] Adds configuration to disable schema merging while converting metastore Parquet tables + Cheng Lian + 2015-03-31 11:21:15 -0700 + Commit: 8102014, github.com/apache/spark/pull/5231 + + [SPARK-6555] [SQL] Overrides equals() and hashCode() for MetastoreRelation + Cheng Lian + 2015-03-31 11:18:25 -0700 + Commit: a7992ff, github.com/apache/spark/pull/5289 + + [SPARK-4894][mllib] Added Bernoulli option to NaiveBayes model in mllib + leahmcguire , Joseph K. 
Bradley , Leah McGuire + 2015-03-31 11:16:55 -0700 + Commit: d01a6d8, github.com/apache/spark/pull/4087 + + [SPARK-6542][SQL] add CreateStruct + Xiangrui Meng + 2015-03-31 17:05:23 +0800 + Commit: a05835b, github.com/apache/spark/pull/5195 + + [SPARK-6618][SQL] HiveMetastoreCatalog.lookupRelation should use fine-grained lock + Yin Huai + 2015-03-31 16:28:40 +0800 + Commit: 314afd0, github.com/apache/spark/pull/5281 + + [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python. + Reynold Xin + 2015-03-31 00:25:23 -0700 + Commit: b80a030, github.com/apache/spark/pull/5284 + + [SPARK-6625][SQL] Add common string filters to data sources. + Reynold Xin + 2015-03-31 00:19:51 -0700 + Commit: f07e714, github.com/apache/spark/pull/5285 + + [SPARK-5124][Core] Move StopCoordinator to the receive method since it does not require a reply + zsxwing + 2015-03-30 22:10:49 -0700 + Commit: 5677557, github.com/apache/spark/pull/5283 + + [SPARK-6119][SQL] DataFrame support for missing data handling + Reynold Xin + 2015-03-30 20:47:10 -0700 + Commit: b8ff2bc, github.com/apache/spark/pull/5274 + + [SPARK-6369] [SQL] Uses commit coordinator to help committing Hive and Parquet tables + Cheng Lian + 2015-03-31 07:48:37 +0800 + Commit: fde6945, github.com/apache/spark/pull/5139 + + [SPARK-6603] [PySpark] [SQL] add SQLContext.udf and deprecate inferSchema() and applySchema + Davies Liu + 2015-03-30 15:47:00 -0700 + Commit: f76d2e5, github.com/apache/spark/pull/5273 + + [HOTFIX][SPARK-4123]: Updated to fix bug where multiple dependencies added breaks Github output + Brennon York + 2015-03-30 12:48:26 -0700 + Commit: df35500, github.com/apache/spark/pull/5269 + + [SPARK-6592][SQL] fix filter for scaladoc to generate API doc for Row class under catalyst dir + CodingCat + 2015-03-30 11:54:44 -0700 + Commit: 32259c6, github.com/apache/spark/pull/5252 + + [SPARK-6595][SQL] MetastoreRelation should be a MultiInstanceRelation + Michael Armbrust + 2015-03-30 22:24:12 +0800 + Commit: fe81f6c, github.com/apache/spark/pull/5251 + + [HOTFIX] Update start-slave.sh + Jose Manuel Gomez + 2015-03-30 14:59:08 +0100 + Commit: 19d4c39, github.com/apache/spark/pull/5262 + + [SPARK-5750][SPARK-3441][SPARK-5836][CORE] Added documentation explaining shuffle + Ilya Ganelin , Ilya Ganelin + 2015-03-30 11:52:02 +0100 + Commit: 4bdfb7b, github.com/apache/spark/pull/5074 + + [SPARK-6596] fix the instruction on building scaladoc + CodingCat + 2015-03-30 11:41:43 +0100 + Commit: de67330, github.com/apache/spark/pull/5253 + + [spark-sql] a better exception message than "scala.MatchError" for unsupported types in Schema creation + Eran Medan + 2015-03-30 00:02:52 -0700 + Commit: 17b13c5, github.com/apache/spark/pull/5235 + + Fix string interpolator error in HeartbeatReceiver + Li Zhihui + 2015-03-29 21:30:37 -0700 + Commit: 01dc9f5, github.com/apache/spark/pull/5255 + + [SPARK-5124][Core] A standard RPC interface and an Akka implementation + zsxwing + 2015-03-29 21:25:09 -0700 + Commit: a8d53af, github.com/apache/spark/pull/4588 + + [SPARK-6585][Tests]Fix FileServerSuite testcase in some Env. 
+ June.He + 2015-03-29 12:47:22 +0100 + Commit: 0e2753f, github.com/apache/spark/pull/5239 + + [SPARK-6558] Utils.getCurrentUserName returns the full principal name instead of login name + Thomas Graves + 2015-03-29 12:43:30 +0100 + Commit: 52ece26, github.com/apache/spark/pull/5229 + + [SPARK-6406] Launch Spark using assembly jar instead of a separate launcher jar + Nishkam Ravi , nishkamravi2 , nravi + 2015-03-29 12:40:37 +0100 + Commit: e3eb393, github.com/apache/spark/pull/5085 + + [SPARK-4123][Project Infra]: Show new dependencies added in pull requests + Brennon York + 2015-03-29 12:37:53 +0100 + Commit: 55153f5, github.com/apache/spark/pull/5093 + + [DOC] Improvements to Python docs. + Reynold Xin + 2015-03-28 23:59:27 -0700 + Commit: 5eef00d, github.com/apache/spark/pull/5238 + + [SPARK-6571][MLLIB] use wrapper in MatrixFactorizationModel.load + Xiangrui Meng + 2015-03-28 15:08:05 -0700 + Commit: f75f633, github.com/apache/spark/pull/5243 + + [SPARK-6552][Deploy][Doc]expose start-slave.sh to user and update outdated doc + WangTaoTheTonic + 2015-03-28 12:32:35 +0000 + Commit: 9963143, github.com/apache/spark/pull/5205 + + [SPARK-6538][SQL] Add missing nullable Metastore fields when merging a Parquet schema + Adam Budde + 2015-03-28 09:14:09 +0800 + Commit: 5909f09, github.com/apache/spark/pull/5214 + + [SPARK-6564][SQL] SQLContext.emptyDataFrame should contain 0 row, not 1 row + Reynold Xin + 2015-03-27 14:56:57 -0700 + Commit: 3af7334, github.com/apache/spark/pull/5226 + + [SPARK-6526][ML] Add Normalizer transformer in ML package + Xusen Yin + 2015-03-27 13:29:10 -0700 + Commit: d5497ab, github.com/apache/spark/pull/5181 + + [SPARK-6574] [PySpark] fix sql example + Davies Liu + 2015-03-27 11:42:26 -0700 + Commit: 887e1b7, github.com/apache/spark/pull/5230 + + [SPARK-6550][SQL] Use analyzed plan in DataFrame + Michael Armbrust + 2015-03-27 11:40:00 -0700 + Commit: 5d9c37c, github.com/apache/spark/pull/5217 + + [SPARK-6544][build] Increment Avro version from 1.7.6 to 1.7.7 + Dean Chen + 2015-03-27 14:32:51 +0000 + Commit: aa2b991, github.com/apache/spark/pull/5193 + + [SPARK-6556][Core] Fix wrong parsing logic of executorTimeoutMs and checkTimeoutIntervalMs in HeartbeatReceiver + zsxwing + 2015-03-27 12:31:06 +0000 + Commit: da546b7, github.com/apache/spark/pull/5209 + + [SPARK-6341][mllib] Upgrade breeze from 0.11.1 to 0.11.2 + Yu ISHIKAWA + 2015-03-27 00:15:02 -0700 + Commit: f43a610, github.com/apache/spark/pull/5222 + + [SPARK-6405] Limiting the maximum Kryo buffer size to be 2GB. 
+ mcheah + 2015-03-26 22:48:42 -0700 + Commit: 49d2ec6, github.com/apache/spark/pull/5218 + + [SPARK-6510][GraphX]: Add Graph#minus method to act as Set#difference + Brennon York + 2015-03-26 19:08:09 -0700 + Commit: 39fb579, github.com/apache/spark/pull/5175 + + [DOCS][SQL] Fix JDBC example + Michael Armbrust + 2015-03-26 14:51:46 -0700 + Commit: aad0032, github.com/apache/spark/pull/5192 + + [SPARK-6554] [SQL] Don't push down predicates which reference partition column(s) + Cheng Lian + 2015-03-26 13:11:37 -0700 + Commit: 71a0d40, github.com/apache/spark/pull/5210 + + [SPARK-6117] [SQL] Improvements to DataFrame.describe() + Reynold Xin + 2015-03-26 12:26:13 -0700 + Commit: 784fcd5, github.com/apache/spark/pull/5201 + + SPARK-6532 [BUILD] LDAModel.scala fails scalastyle on Windows + Sean Owen + 2015-03-26 10:52:31 -0700 + Commit: c3a52a0, github.com/apache/spark/pull/5211 + + SPARK-6480 [CORE] histogram() bucket function is wrong in some simple edge cases + Sean Owen + 2015-03-26 15:00:23 +0000 + Commit: fe15ea9, github.com/apache/spark/pull/5148 + + [MLlib]remove unused import + Yuhao Yang + 2015-03-26 13:27:05 +0000 + Commit: 3ddb975, github.com/apache/spark/pull/5207 + + [SQL][SPARK-6471]: Metastore schema should only be a subset of parquet schema to support dropping of columns using replace columns + Yash Datta + 2015-03-26 21:13:38 +0800 + Commit: 1c05027, github.com/apache/spark/pull/5141 + + [SPARK-6468][Block Manager] Fix the race condition of subDirs in DiskBlockManager + zsxwing + 2015-03-26 12:54:48 +0000 + Commit: 0c88ce5, github.com/apache/spark/pull/5136 + + [SPARK-6465][SQL] Fix serialization of GenericRowWithSchema using kryo + Michael Armbrust + 2015-03-26 18:46:57 +0800 + Commit: f88f51b, github.com/apache/spark/pull/5191 + + [SPARK-6546][Build] Using the wrong code that will make spark compile failed!! + DoingDone9 <799203320@qq.com> + 2015-03-26 17:04:19 +0800 + Commit: 855cba8, github.com/apache/spark/pull/5198 + + [SPARK-6117] [SQL] add describe function to DataFrame for summary statis... + azagrebin + 2015-03-26 00:25:04 -0700 + Commit: 5bbcd13, github.com/apache/spark/pull/5073 + + [SPARK-6536] [PySpark] Column.inSet() in Python + Davies Liu + 2015-03-26 00:01:24 -0700 + Commit: f535802, github.com/apache/spark/pull/5190 + + [SPARK-6463][SQL] AttributeSet.equal should compare size + sisihj , Michael Armbrust + 2015-03-25 19:21:54 -0700 + Commit: 276ef1c, github.com/apache/spark/pull/5194 + + The UT test of spark is failed. 
Because there is a test in SQLQuerySuite about creating table “test” + KaiXinXiaoLei + 2015-03-25 19:15:30 -0700 + Commit: e87bf37, github.com/apache/spark/pull/5150 + + [SPARK-6202] [SQL] enable variable substitution on test framework + Daoyuan Wang + 2015-03-25 18:43:26 -0700 + Commit: 5ab6e9f, github.com/apache/spark/pull/4930 + + [SPARK-6271][SQL] Sort these tokens in alphabetic order to avoid further duplicate in HiveQl + DoingDone9 <799203320@qq.com> + 2015-03-25 18:41:59 -0700 + Commit: 328daf6, github.com/apache/spark/pull/4973 + + [SPARK-6326][SQL] Improve castStruct to be faster + Liang-Chi Hsieh + 2015-03-25 17:52:23 -0700 + Commit: 73d5775, github.com/apache/spark/pull/5017 + + [SPARK-5498][SQL]fix query exception when partition schema does not match table schema + jeanlyn + 2015-03-25 17:47:45 -0700 + Commit: e6d1406, github.com/apache/spark/pull/4289 + + [SPARK-6450] [SQL] Fixes metastore Parquet table conversion + Cheng Lian + 2015-03-25 17:40:19 -0700 + Commit: 8c3b005, github.com/apache/spark/pull/5183 + + [SPARK-6079] Use index to speed up StatusTracker.getJobIdsForGroup() + Josh Rosen + 2015-03-25 17:40:00 -0700 + Commit: d44a336, github.com/apache/spark/pull/4830 + + [SPARK-5987] [MLlib] Save/load for GaussianMixtureModels + MechCoder + 2015-03-25 14:45:23 -0700 + Commit: 4fc4d03, github.com/apache/spark/pull/4986 + + [SPARK-6256] [MLlib] MLlib Python API parity check for regression + Yanbo Liang + 2015-03-25 13:38:33 -0700 + Commit: 4353373, github.com/apache/spark/pull/4997 + + [SPARK-5771] Master UI inconsistently displays application cores + Andrew Or + 2015-03-25 13:28:32 -0700 + Commit: c1b74df, github.com/apache/spark/pull/5177 + + [SPARK-6537] UIWorkloadGenerator: The main thread should not stop SparkContext until all jobs finish + Kousuke Saruta + 2015-03-25 13:27:15 -0700 + Commit: acef51d, github.com/apache/spark/pull/5187 + + [SPARK-6076][Block Manager] Fix a potential OOM issue when StorageLevel is MEMORY_AND_DISK_SER + zsxwing + 2015-03-25 12:09:30 -0700 + Commit: 883b7e9, github.com/apache/spark/pull/4827 + + [SPARK-6409][SQL] It is not necessary that avoid old inteface of hive, because this will make some UDAF can not work. + DoingDone9 <799203320@qq.com> + 2015-03-25 11:11:52 -0700 + Commit: 968408b, github.com/apache/spark/pull/5131 + + [ML][FEATURE] SPARK-5566: RegEx Tokenizer + Augustin Borsu , Augustin Borsu , Augustin Borsu , Xiangrui Meng + 2015-03-25 10:16:39 -0700 + Commit: 982952f, github.com/apache/spark/pull/4504 + + [SPARK-6496] [MLLIB] GeneralizedLinearAlgorithm.run(input, initialWeights) should initialize numFeatures + Yanbo Liang + 2015-03-25 17:05:56 +0000 + Commit: 10c7860, github.com/apache/spark/pull/5167 + + [SPARK-6483][SQL]Improve ScalaUdf called performance. + zzcclp + 2015-03-25 19:11:04 +0800 + Commit: 64262ed, github.com/apache/spark/pull/5154 + + [DOCUMENTATION]Fixed Missing Type Import in Documentation + Bill Chambers , anabranch + 2015-03-24 22:24:35 -0700 + Commit: c5cc414, github.com/apache/spark/pull/5179 + + [SPARK-6515] update OpenHashSet impl + Xiangrui Meng + 2015-03-24 18:58:27 -0700 + Commit: c14ddd9, github.com/apache/spark/pull/5176 + + [SPARK-6428][Streaming] Added explicit types for all public methods. 
+ Reynold Xin + 2015-03-24 17:08:25 -0700 + Commit: 9459865, github.com/apache/spark/pull/5110 + + [SPARK-6512] add contains to OpenHashMap + Xiangrui Meng + 2015-03-24 17:06:22 -0700 + Commit: 6930e96, github.com/apache/spark/pull/5171 + + [SPARK-6469] Improving documentation on YARN local directories usage + Christophe Préaud + 2015-03-24 17:05:49 -0700 + Commit: 05c2214, github.com/apache/spark/pull/5165 + + Revert "[SPARK-5771] Number of Cores in Completed Applications of Standalone Master Web Page always be 0 if sc.stop() is called" + Andrew Or + 2015-03-24 16:49:27 -0700 + Commit: dd907d1 + + Revert "[SPARK-5771][UI][hotfix] Change Requested Cores into * if default cores is not set" + Andrew Or + 2015-03-24 16:41:31 -0700 + Commit: f7c3668 + + [SPARK-3570] Include time to open files in shuffle write time. + Kay Ousterhout + 2015-03-24 16:29:40 -0700 + Commit: d8ccf65, github.com/apache/spark/pull/4550 + + [SPARK-6088] Correct how tasks that get remote results are shown in UI. + Kay Ousterhout + 2015-03-24 16:26:43 -0700 + Commit: 6948ab6, github.com/apache/spark/pull/4839 + + [SPARK-6428][SQL] Added explicit types for all public methods in catalyst + Reynold Xin + 2015-03-24 16:03:55 -0700 + Commit: 7334801, github.com/apache/spark/pull/5162 + + [SPARK-6209] Clean up connections in ExecutorClassLoader after failing to load classes (master branch PR) + Josh Rosen + 2015-03-24 14:38:20 -0700 + Commit: 7215aa74, github.com/apache/spark/pull/4944 + + [SPARK-6458][SQL] Better error messages for invalid data sources + Michael Armbrust + 2015-03-24 14:10:56 -0700 + Commit: a8f51b8, github.com/apache/spark/pull/5158 + + [SPARK-6376][SQL] Avoid eliminating subqueries until optimization + Michael Armbrust + 2015-03-24 14:08:20 -0700 + Commit: cbeaf9e, github.com/apache/spark/pull/5160 + + [SPARK-6375][SQL] Fix formatting of error messages. + Michael Armbrust + 2015-03-24 13:22:46 -0700 + Commit: 046c1e2, github.com/apache/spark/pull/5155 + + [SPARK-6054][SQL] Fix transformations of TreeNodes that hold StructTypes + Michael Armbrust + 2015-03-24 12:28:01 -0700 + Commit: 3fa3d12, github.com/apache/spark/pull/5157 + + [SPARK-6437][SQL] Use completion iterator to close external sorter + Michael Armbrust + 2015-03-24 12:10:30 -0700 + Commit: 26c6ce3, github.com/apache/spark/pull/5161 + + [SPARK-6459][SQL] Warn when constructing trivially true equals predicate + Michael Armbrust + 2015-03-24 12:09:02 -0700 + Commit: 32efadd, github.com/apache/spark/pull/5163 + + [SPARK-6361][SQL] support adding a column with metadata in DF + Xiangrui Meng + 2015-03-24 12:08:19 -0700 + Commit: 6bdddb6, github.com/apache/spark/pull/5151 + + [SPARK-6475][SQL] recognize array types when infer data types from JavaBeans + Xiangrui Meng + 2015-03-24 10:11:27 -0700 + Commit: a1d1529, github.com/apache/spark/pull/5146 + + [ML][docs][minor] Define LabeledDocument/Document classes in CV example + Peter Rudenko + 2015-03-24 16:33:38 +0000 + Commit: 08d4528, github.com/apache/spark/pull/5135 + + [SPARK-5559] [Streaming] [Test] Remove oppotunity we met flakiness when running FlumeStreamSuite + Kousuke Saruta + 2015-03-24 16:13:25 +0000 + Commit: 85cf063, github.com/apache/spark/pull/4337 + + [SPARK-6473] [core] Do not try to figure out Scala version if not needed... 
+ Marcelo Vanzin + 2015-03-24 13:48:33 +0000 + Commit: b293afc, github.com/apache/spark/pull/5143 + + Update the command to use IPython notebook + Cong Yue + 2015-03-24 12:56:13 +0000 + Commit: c12312f, github.com/apache/spark/pull/5111 + + [SPARK-6477][Build]: Run MIMA tests before the Spark test suite + Brennon York + 2015-03-24 10:33:04 +0000 + Commit: 37fac1d, github.com/apache/spark/pull/5145 + + [SPARK-6452] [SQL] Checks for missing attributes and unresolved operator for all types of operator + Cheng Lian + 2015-03-24 01:12:11 -0700 + Commit: 1afcf77, github.com/apache/spark/pull/5129 + + [SPARK-6428] Added explicit types for all public methods in core. + Reynold Xin + 2015-03-23 23:41:06 -0700 + Commit: 4ce2782, github.com/apache/spark/pull/5125 + + [SPARK-6124] Support jdbc connection properties in OPTIONS part of the query + Volodymyr Lyubinets + 2015-03-23 17:00:27 -0700 + Commit: bfd3ee9, github.com/apache/spark/pull/4859 + + Revert "[SPARK-6122][Core] Upgrade Tachyon client version to 0.6.1." + Patrick Wendell + 2015-03-23 15:08:39 -0700 + Commit: 6cd7058 + + [SPARK-6308] [MLlib] [Sql] Override TypeName in VectorUDT and MatrixUDT + MechCoder + 2015-03-23 13:30:21 -0700 + Commit: 474d132, github.com/apache/spark/pull/5118 + + [SPARK-6397][SQL] Check the missingInput simply + Yadong Qi + 2015-03-23 18:16:49 +0800 + Commit: 9f3273b, github.com/apache/spark/pull/5132 + + Revert "[SPARK-6397][SQL] Check the missingInput simply" + Cheng Lian + 2015-03-23 12:15:19 +0800 + Commit: bf044de + + [SPARK-6397][SQL] Check the missingInput simply + q00251598 + 2015-03-23 12:06:13 +0800 + Commit: e566fe5, github.com/apache/spark/pull/5082 + + [SPARK-4985] [SQL] parquet support for date type + Daoyuan Wang + 2015-03-23 11:46:16 +0800 + Commit: 4659468, github.com/apache/spark/pull/3822 + + [SPARK-6337][Documentation, SQL]Spark 1.3 doc fixes + vinodkc + 2015-03-22 20:00:08 +0000 + Commit: 2bf40c5, github.com/apache/spark/pull/5112 + + [HOTFIX] Build break due to https://github.com/apache/spark/pull/5128 + Reynold Xin + 2015-03-22 12:08:15 -0700 + Commit: 7a0da47 + + [SPARK-6122][Core] Upgrade Tachyon client version to 0.6.1. 
+ Calvin Jia + 2015-03-22 11:11:29 -0700 + Commit: a41b9c6, github.com/apache/spark/pull/4867 + + SPARK-6454 [DOCS] Fix links to pyspark api + Kamil Smuga , stderr + 2015-03-22 15:56:25 +0000 + Commit: 6ef4863, github.com/apache/spark/pull/5120 + + [SPARK-6453][Mesos] Some Mesos*Suite have a different package with their classes + Jongyoul Lee + 2015-03-22 15:53:18 +0000 + Commit: adb2ff7, github.com/apache/spark/pull/5126 + + [SPARK-6455] [docs] Correct some mistakes and typos + Hangchen Yu + 2015-03-22 15:51:10 +0000 + Commit: ab4f516, github.com/apache/spark/pull/5128 + + [SPARK-6448] Make history server log parse exceptions + Ryan Williams + 2015-03-22 11:54:23 +0000 + Commit: b9fe504, github.com/apache/spark/pull/5122 + + [SPARK-6408] [SQL] Fix JDBCRDD filtering string literals + ypcat , Pei-Lun Lee + 2015-03-22 15:49:13 +0800 + Commit: 9b1e1f2, github.com/apache/spark/pull/5087 + + [SPARK-6428][SQL] Added explicit type for all public methods for Hive module + Reynold Xin + 2015-03-21 14:30:04 -0700 + Commit: b6090f9, github.com/apache/spark/pull/5108 + + [SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser. + Yin Huai + 2015-03-21 13:27:53 -0700 + Commit: 94a102a, github.com/apache/spark/pull/5078 + + [SPARK-5680][SQL] Sum function on all null values, should return zero + Venkata Ramana G , Venkata Ramana Gollamudi + 2015-03-21 13:24:24 -0700 + Commit: ee569a0, github.com/apache/spark/pull/4466 + + [SPARK-5320][SQL]Add statistics method at NoRelation (override super). + x1- + 2015-03-21 13:22:34 -0700 + Commit: 52dd4b2, github.com/apache/spark/pull/5105 + + [SPARK-5821] [SQL] JSON CTAS command should throw error message when delete path failure + Yanbo Liang , Yanbo Liang + 2015-03-21 11:23:28 +0800 + Commit: e5d2c37, github.com/apache/spark/pull/4610 + + [SPARK-6315] [SQL] Also tries the case class string parser while reading Parquet schema + Cheng Lian + 2015-03-21 11:18:45 +0800 + Commit: 937c1e5, github.com/apache/spark/pull/5034 + + [SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful + Yanbo Liang + 2015-03-21 10:53:04 +0800 + Commit: bc37c97, github.com/apache/spark/pull/5107 + + [SPARK-6025] [MLlib] Add helper method evaluateEachIteration to extract learning curve + MechCoder + 2015-03-20 17:14:09 -0700 + Commit: 25e271d, github.com/apache/spark/pull/4906 + + [SPARK-6428][SQL] Added explicit type for all public methods in sql/core + Reynold Xin + 2015-03-20 15:47:07 -0700 + Commit: a95043b, github.com/apache/spark/pull/5104 + + [SPARK-6421][MLLIB] _regression_train_wrapper does not test initialWeights correctly + lewuathe + 2015-03-20 17:18:18 -0400 + Commit: 257cde7, github.com/apache/spark/pull/5101 + + [SPARK-6309] [SQL] [MLlib] Implement MatrixUDT + MechCoder + 2015-03-20 17:13:18 -0400 + Commit: 11e0259, github.com/apache/spark/pull/5048 + + [SPARK-6423][Mesos] MemoryUtils should use memoryOverhead if it's set + Jongyoul Lee + 2015-03-20 19:14:35 +0000 + Commit: 49a01c7, github.com/apache/spark/pull/5099 + + [SPARK-5955][MLLIB] add checkpointInterval to ALS + Xiangrui Meng + 2015-03-20 15:02:57 -0400 + Commit: 6b36470, github.com/apache/spark/pull/5076 + + [Spark 6096][MLlib] Add Naive Bayes load save methods in Python + Xusen Yin + 2015-03-20 14:53:59 
-0400 + Commit: 25636d9, github.com/apache/spark/pull/5090 + + [MLlib] SPARK-5954: Top by key + Shuo Xiang + 2015-03-20 14:45:44 -0400 + Commit: 5e6ad24, github.com/apache/spark/pull/5075 + + [SPARK-6095] [MLLIB] Support model save/load in Python's linear models + Yanbo Liang + 2015-03-20 14:44:21 -0400 + Commit: 48866f7, github.com/apache/spark/pull/5016 + + [SPARK-6371] [build] Update version to 1.4.0-SNAPSHOT. + Marcelo Vanzin + 2015-03-20 18:43:57 +0000 + Commit: a745645, github.com/apache/spark/pull/5056 + + [SPARK-6426][Doc]User could also point the yarn cluster config directory via YARN_CONF_DI... + WangTaoTheTonic + 2015-03-20 18:42:18 +0000 + Commit: 385b2ff, github.com/apache/spark/pull/5103 + + [SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample + mbonaci + 2015-03-20 18:30:45 +0000 + Commit: 28bcb9e, github.com/apache/spark/pull/5097 + + [SPARK-6428][MLlib] Added explicit type for public methods and implemented hashCode when equals is defined. + Reynold Xin + 2015-03-20 14:13:02 -0400 + Commit: db4d317, github.com/apache/spark/pull/5102 + + SPARK-6338 [CORE] Use standard temp dir mechanisms in tests to avoid orphaned temp files + Sean Owen + 2015-03-20 14:16:21 +0000 + Commit: 6f80c3e, github.com/apache/spark/pull/5029 + + SPARK-5134 [BUILD] Bump default Hadoop version to 2+ + Sean Owen + 2015-03-20 14:14:53 +0000 + Commit: d08e3eb, github.com/apache/spark/pull/5027 + + [SPARK-6286][Mesos][minor] Handle missing Mesos case TASK_ERROR + Jongyoul Lee + 2015-03-20 12:24:34 +0000 + Commit: 116c553, github.com/apache/spark/pull/5088 + + Tighten up field/method visibility in Executor and made some code more clear to read. + Reynold Xin + 2015-03-19 22:12:01 -0400 + Commit: 0745a30, github.com/apache/spark/pull/4850 + + [SPARK-6219] [Build] Check that Python code compiles + Nicholas Chammas + 2015-03-19 12:46:10 -0700 + Commit: f17d43b, github.com/apache/spark/pull/4941 + + [Core][minor] remove unused `visitedStages` in `DAGScheduler.stageDependsOn` + Wenchen Fan + 2015-03-19 15:25:32 -0400 + Commit: 3b5aaa6, github.com/apache/spark/pull/5086 + + [SPARK-5313][Project Infra]: Create simple framework for highlighting changes introduced in a PR + Brennon York + 2015-03-19 11:18:24 -0400 + Commit: 8cb23a1, github.com/apache/spark/pull/5072 + + [SPARK-6291] [MLLIB] GLM toString & toDebugString + Yanbo Liang + 2015-03-19 11:10:20 -0400 + Commit: dda4ded, github.com/apache/spark/pull/5038 + + [SPARK-5843] [API] Allowing map-side combine to be specified in Java. 
+ mcheah + 2015-03-19 08:51:49 -0400 + Commit: 3c4e486, github.com/apache/spark/pull/4634 + + [SPARK-6402][DOC] - Remove some refererences to shark in docs and ec2 + Pierre Borckmans + 2015-03-19 08:02:06 -0400 + Commit: 797f8a0, github.com/apache/spark/pull/5083 + + [SPARK-4012] stop SparkContext when the exception is thrown from an infinite loop + CodingCat + 2015-03-18 23:48:45 -0700 + Commit: 2c3f83c, github.com/apache/spark/pull/5004 + + [SPARK-6222][Streaming] Dont delete checkpoint data when doing pre-batch-start checkpoint + Tathagata Das + 2015-03-19 02:15:50 -0400 + Commit: 645cf3f, github.com/apache/spark/pull/5008 + + [SPARK-6394][Core] cleanup BlockManager companion object and improve the getCacheLocs method in DAGScheduler + Wenchen Fan + 2015-03-18 19:43:04 -0700 + Commit: 540b2a4, github.com/apache/spark/pull/5043 + + SPARK-6085 Part. 2 Increase default value for memory overhead + Jongyoul Lee + 2015-03-18 20:54:22 -0400 + Commit: 3db1387, github.com/apache/spark/pull/5065 + + [SPARK-6374] [MLlib] add get for GeneralizedLinearAlgo + Yuhao Yang + 2015-03-18 13:44:37 -0400 + Commit: a95ee24, github.com/apache/spark/pull/5058 + + [SPARK-6325] [core,yarn] Do not change target executor count when killing executors. + Marcelo Vanzin + 2015-03-18 09:18:28 -0400 + Commit: 981fbaf, github.com/apache/spark/pull/5018 + + [SPARK-6286][minor] Handle missing Mesos case TASK_ERROR. + Iulian Dragos + 2015-03-18 09:15:33 -0400 + Commit: 9d112a9, github.com/apache/spark/pull/5000 + + SPARK-6389 YARN app diagnostics report doesn't report NPEs + Steve Loughran + 2015-03-18 09:09:32 -0400 + Commit: e09c852, github.com/apache/spark/pull/5070 + + [SPARK-6372] [core] Propagate --conf to child processes. + Marcelo Vanzin + 2015-03-18 09:06:57 -0400 + Commit: 6205a25, github.com/apache/spark/pull/5057 + + [SPARK-6247][SQL] Fix resolution of ambiguous joins caused by new aliases + Michael Armbrust + 2015-03-17 19:47:51 -0700 + Commit: 3579003, github.com/apache/spark/pull/5062 + + [SPARK-5651][SQL] Add input64 in blacklist and add test suit for create table within backticks + watermen , q00251598 + 2015-03-17 19:35:18 -0700 + Commit: a6ee2f7, github.com/apache/spark/pull/4427 + + [SPARK-5404] [SQL] Update the default statistic number + Cheng Hao + 2015-03-17 19:32:38 -0700 + Commit: 78cb08a, github.com/apache/spark/pull/4914 + + [SPARK-5908][SQL] Resolve UdtfsAlias when only single Alias is used + Liang-Chi Hsieh + 2015-03-17 18:58:52 -0700 + Commit: 5c80643, github.com/apache/spark/pull/4692 + + [SPARK-6383][SQL]Fixed compiler and errors in Dataframe examples + Tijo Thomas + 2015-03-17 18:50:19 -0700 + Commit: a012e08, github.com/apache/spark/pull/5068 + + [SPARK-6366][SQL] In Python API, the default save mode for save and saveAsTable should be "error" instead of "append". 
+ Yin Huai + 2015-03-18 09:41:06 +0800 + Commit: dc9c919, github.com/apache/spark/pull/5053 + + [SPARK-6330] [SQL] Add a test case for SPARK-6330 + Pei-Lun Lee + 2015-03-18 08:34:46 +0800 + Commit: 4633a87, github.com/apache/spark/pull/5039 + + [SPARK-6226][MLLIB] add save/load in PySpark's KMeansModel + Xiangrui Meng + 2015-03-17 12:14:40 -0700 + Commit: c94d062, github.com/apache/spark/pull/5049 + + [SPARK-6336] LBFGS should document what convergenceTol means + lewuathe + 2015-03-17 12:11:57 -0700 + Commit: d9f3e01, github.com/apache/spark/pull/5033 + + [SPARK-6313] Add config option to disable file locks/fetchFile cache to ... + nemccarthy + 2015-03-17 09:33:11 -0700 + Commit: 4cca391, github.com/apache/spark/pull/5036 + + [SPARK-3266] Use intermediate abstract classes to fix type erasure issues in Java APIs + Josh Rosen + 2015-03-17 09:18:57 -0700 + Commit: 0f673c2, github.com/apache/spark/pull/5050 + + [SPARK-6365] jetty-security also needed for SPARK_PREPEND_CLASSES to work + Imran Rashid + 2015-03-17 09:41:06 -0500 + Commit: e9f22c6, github.com/apache/spark/pull/5052 + + [SPARK-6331] Load new master URL if present when recovering streaming context from checkpoint + Tathagata Das + 2015-03-17 05:31:27 -0700 + Commit: c928796, github.com/apache/spark/pull/5024 + + [docs] [SPARK-4820] Spark build encounters "File name too long" on some encrypted filesystems + Theodore Vasiloudis + 2015-03-17 11:25:01 +0000 + Commit: e26db9be, github.com/apache/spark/pull/5041 + + [SPARK-6269] [CORE] Use ScalaRunTime's array methods instead of java.lang.reflect.Array in size estimation + mcheah , Justin Uang + 2015-03-17 11:20:20 +0000 + Commit: 005d1c5, github.com/apache/spark/pull/4972 + + [SPARK-4011] tighten the visibility of the members in Master/Worker class + CodingCat + 2015-03-17 11:18:27 +0000 + Commit: 25f3580, github.com/apache/spark/pull/4844 + + SPARK-6044 [CORE] RDD.aggregate() should not use the closure serializer on the zero value + Sean Owen + 2015-03-16 23:58:52 -0700 + Commit: b2d8c02, github.com/apache/spark/pull/5028 + + [SPARK-6357][GraphX] Add unapply in EdgeContext + Takeshi YAMAMURO + 2015-03-16 23:54:54 -0700 + Commit: b3e6eca, github.com/apache/spark/pull/5047 + + [SQL][docs][minor] Fixed sample code in SQLContext scaladoc + Lomig Mégard + 2015-03-16 23:52:42 -0700 + Commit: 6870722, github.com/apache/spark/pull/5051 + + [SPARK-6299][CORE] ClassNotFoundException in standalone mode when running groupByKey with class defined in REPL + Kevin (Sangwoo) Kim + 2015-03-16 23:49:23 -0700 + Commit: f0edeae, github.com/apache/spark/pull/5046 + + [SPARK-5712] [SQL] fix comment with semicolon at end + Daoyuan Wang + 2015-03-17 12:29:15 +0800 + Commit: 9667b9f, github.com/apache/spark/pull/4500 + + [SPARK-6327] [PySpark] fix launch spark-submit from python + Davies Liu + 2015-03-16 16:26:55 -0700 + Commit: e3f315a, github.com/apache/spark/pull/5019 + + [SPARK-6077] Remove streaming tab while stopping StreamingContext + lisurprise + 2015-03-16 13:10:32 -0700 + Commit: f149b8b, github.com/apache/spark/pull/4828 + + [SPARK-6330] Fix filesystem bug in newParquet relation + Volodymyr Lyubinets + 2015-03-16 12:13:18 -0700 + Commit: d19efed, github.com/apache/spark/pull/5020 + + [SPARK-2087] [SQL] Multiple thriftserver sessions with 
single HiveContext instance + Cheng Hao + 2015-03-17 01:09:27 +0800 + Commit: 12a345a, github.com/apache/spark/pull/4885 + + [SPARK-6300][Spark Core] sc.addFile(path) does not support the relative path. + DoingDone9 <799203320@qq.com> + 2015-03-16 12:27:15 +0000 + Commit: 00e730b, github.com/apache/spark/pull/4993 + + [SPARK-5922][GraphX]: Add diff(other: RDD[VertexId, VD]) in VertexRDD + Brennon York + 2015-03-16 01:06:26 -0700 + Commit: 45f4c66, github.com/apache/spark/pull/4733 + + [SPARK-3619] Part 2. Upgrade to Mesos 0.21 to work around MESOS-1688 + Jongyoul Lee + 2015-03-15 15:46:55 +0000 + Commit: aa6536f, github.com/apache/spark/pull/4361 + + [SPARK-6285][SQL]Remove ParquetTestData in SparkBuild.scala and in README.md + OopsOutOfMemory + 2015-03-15 20:44:45 +0800 + Commit: 62ede53, github.com/apache/spark/pull/5032 + + [SPARK-5790][GraphX]: VertexRDD's won't zip properly for `diff` capability (added tests) + Brennon York + 2015-03-14 17:38:12 +0000 + Commit: c49d156, github.com/apache/spark/pull/5023 + + [SPARK-6329][Docs]: Minor doc changes for Mesos and TOC + Brennon York + 2015-03-14 17:28:13 +0000 + Commit: 127268b, github.com/apache/spark/pull/5022 + + [SPARK-6195] [SQL] Adds in-memory column type for fixed-precision decimals + Cheng Lian + 2015-03-14 19:53:54 +0800 + Commit: 5be6b0e, github.com/apache/spark/pull/4938 + + [SQL]Delete some dupliate code in HiveThriftServer2 + ArcherShao , ArcherShao + 2015-03-14 08:27:18 +0000 + Commit: ee15404, github.com/apache/spark/pull/5007 + + [SPARK-6210] [SQL] use prettyString as column name in agg() + Davies Liu + 2015-03-14 00:43:33 -0700 + Commit: b38e073, github.com/apache/spark/pull/5006 + + [SPARK-6317][SQL]Fixed HIVE console startup issue + vinodkc , Vinod K C + 2015-03-14 07:17:54 +0800 + Commit: e360d5e, github.com/apache/spark/pull/5011 + + [SPARK-6285] [SQL] Removes unused ParquetTestData and duplicated TestGroupWriteSupport + Cheng Lian + 2015-03-14 07:09:53 +0800 + Commit: cdc34ed, github.com/apache/spark/pull/5010 + + [SPARK-4600][GraphX]: org.apache.spark.graphx.VertexRDD.diff does not work + Brennon York + 2015-03-13 18:48:31 +0000 + Commit: b943f5d, github.com/apache/spark/pull/5015 + + [SPARK-6278][MLLIB] Mention the change of objective in linear regression + Xiangrui Meng + 2015-03-13 10:27:28 -0700 + Commit: 7f13434, github.com/apache/spark/pull/4978 + + [SPARK-6252] [mllib] Added getLambda to Scala NaiveBayes + Joseph K. Bradley , Joseph K. Bradley + 2015-03-13 10:26:09 -0700 + Commit: dc4abd4, github.com/apache/spark/pull/4969 + + [CORE][minor] remove unnecessary ClassTag in `DAGScheduler` + Wenchen Fan + 2015-03-13 14:08:56 +0000 + Commit: ea3d2ee, github.com/apache/spark/pull/4992 + + [SPARK-6197][CORE] handle json exception when hisotry file not finished writing + Zhang, Liye + 2015-03-13 13:59:54 +0000 + Commit: 9048e81, github.com/apache/spark/pull/4927 + + [SPARK-5310] [SQL] [DOC] Parquet section for the SQL programming guide + Cheng Lian + 2015-03-13 21:34:50 +0800 + Commit: 69ff8e8, github.com/apache/spark/pull/5001 + + [SPARK-5845][Shuffle] Time to cleanup spilled shuffle files not included in shuffle write time + Ilya Ganelin + 2015-03-13 13:21:04 +0000 + Commit: 0af9ea7, github.com/apache/spark/pull/4965 + + HOTFIX: Changes to release script. 
+ Patrick Wendell + 2015-03-12 18:36:17 -0700 + Commit: 3980ebd + + [mllib] [python] Add LassoModel to __all__ in regression.py + Joseph K. Bradley + 2015-03-12 16:46:29 -0700 + Commit: 17c309c, github.com/apache/spark/pull/4970 + + [SPARK-4588] ML Attributes + Xiangrui Meng , Sean Owen + 2015-03-12 16:34:56 -0700 + Commit: a4b2716, github.com/apache/spark/pull/4925 + + [SPARK-6268][MLlib] KMeans parameter getter methods + Yuhao Yang + 2015-03-12 15:17:46 -0700 + Commit: fb4787c, github.com/apache/spark/pull/4974 + + [build] [hotfix] Fix make-distribution.sh for Scala 2.11. + Marcelo Vanzin + 2015-03-12 19:16:58 +0000 + Commit: 8f1bc79, github.com/apache/spark/pull/5002 + + [SPARK-6275][Documentation]Miss toDF() function in docs/sql-programming-guide.md + zzcclp + 2015-03-12 15:07:15 +0000 + Commit: 304366c, github.com/apache/spark/pull/4977 + + [docs] [SPARK-6306] Readme points to dead link + Theodore Vasiloudis + 2015-03-12 15:01:33 +0000 + Commit: 4e47d54, github.com/apache/spark/pull/4999 + + [SPARK-5814][MLLIB][GRAPHX] Remove JBLAS from runtime + Xiangrui Meng + 2015-03-12 01:39:04 -0700 + Commit: 0cba802, github.com/apache/spark/pull/4699 + + [SPARK-6294] fix hang when call take() in JVM on PythonRDD + Davies Liu + 2015-03-12 01:34:38 -0700 + Commit: 712679a, github.com/apache/spark/pull/4987 + + [SPARK-6296] [SQL] Added equals to Column + Volodymyr Lyubinets + 2015-03-12 00:55:26 -0700 + Commit: 25b71d8, github.com/apache/spark/pull/4988 + + BUILD: Adding more known contributor names + Patrick Wendell + 2015-03-11 22:24:08 -0700 + Commit: e921a66 + + [SPARK-6128][Streaming][Documentation] Updates to Spark Streaming Programming Guide + Tathagata Das + 2015-03-11 18:48:21 -0700 + Commit: cd3b68d, github.com/apache/spark/pull/4956 + + [SPARK-6274][Streaming][Examples] Added examples streaming + sql examples. + Tathagata Das + 2015-03-11 11:19:51 -0700 + Commit: 51a79a7, github.com/apache/spark/pull/4975 + + SPARK-6245 [SQL] jsonRDD() of empty RDD results in exception + Sean Owen + 2015-03-11 14:09:09 +0000 + Commit: 55c4831, github.com/apache/spark/pull/4971 + + SPARK-3642. Document the nuances of shared variables. + Sandy Ryza + 2015-03-11 13:22:05 +0000 + Commit: 2d87a41, github.com/apache/spark/pull/2490 + + [SPARK-4423] Improve foreach() documentation to avoid confusion between local- and cluster-mode behavior + Ilya Ganelin + 2015-03-11 13:20:15 +0000 + Commit: 548643a, github.com/apache/spark/pull/4696 + + [SPARK-6228] [network] Move SASL classes from network/shuffle to network... 
+ Marcelo Vanzin + 2015-03-11 13:16:22 +0000 + Commit: 5b335bd, github.com/apache/spark/pull/4953 + + SPARK-6225 [CORE] [SQL] [STREAMING] Resolve most build warnings, 1.3.0 edition + Sean Owen + 2015-03-11 13:15:19 +0000 + Commit: 6e94c4e, github.com/apache/spark/pull/4950 + + [SPARK-6279][Streaming]In KafkaRDD.scala, Miss expressions flag "s" at logging string + zzcclp + 2015-03-11 12:22:24 +0000 + Commit: ec30c17, github.com/apache/spark/pull/4979 + + [SQL][Minor] fix typo in comments + Hongbo Liu + 2015-03-11 12:18:24 +0000 + Commit: 40f4979, github.com/apache/spark/pull/4976 + + [MINOR] [DOCS] Fix map -> mapToPair in Streaming Java example + Sean Owen + 2015-03-11 12:16:32 +0000 + Commit: 35b2564, github.com/apache/spark/pull/4967 + + [SPARK-4924] Add a library for launching Spark jobs programmatically. + Marcelo Vanzin + 2015-03-11 01:03:01 -0700 + Commit: 517975d, github.com/apache/spark/pull/3916 + + [SPARK-5986][MLLib] Add save/load for k-means + Xusen Yin + 2015-03-11 00:24:55 -0700 + Commit: 2d4e00e, github.com/apache/spark/pull/4951 + + [SPARK-5183][SQL] Update SQL Docs with JDBC and Migration Guide + Michael Armbrust + 2015-03-10 18:13:09 -0700 + Commit: 2672374, github.com/apache/spark/pull/4958 + + Minor doc: Remove the extra blank line in data types javadoc. + Reynold Xin + 2015-03-10 17:25:04 -0700 + Commit: 74fb433, github.com/apache/spark/pull/4955 + + [SPARK-6186] [EC2] Make Tachyon version configurable in EC2 deployment script + cheng chang + 2015-03-10 11:02:12 +0000 + Commit: 7c7d2d5, github.com/apache/spark/pull/4901 + + [SPARK-6191] [EC2] Generalize ability to download libs + Nicholas Chammas + 2015-03-10 10:58:31 +0000 + Commit: d14df06, github.com/apache/spark/pull/4919 + + [SPARK-6087][CORE] Provide actionable exception if Kryo buffer is not large enough + Lev Khomich + 2015-03-10 10:55:42 +0000 + Commit: c4c4b07, github.com/apache/spark/pull/4947 + + [SPARK-6177][MLlib]Add note in LDA example to remind possible coalesce + Yuhao Yang + 2015-03-10 10:51:44 +0000 + Commit: 9a0272f, github.com/apache/spark/pull/4899 + + [SPARK-6194] [SPARK-677] [PySpark] fix memory leak in collect() + Davies Liu + 2015-03-09 16:24:06 -0700 + Commit: 8767565, github.com/apache/spark/pull/4923 + + [SPARK-5310][Doc] Update SQL Programming Guide to include DataFrames. 
+ Reynold Xin + 2015-03-09 16:16:16 -0700 + Commit: 3cac199, github.com/apache/spark/pull/4954 + + [Docs] Replace references to SchemaRDD with DataFrame + Reynold Xin + 2015-03-09 13:29:19 -0700 + Commit: 70f8814, github.com/apache/spark/pull/4952 + + [EC2] [SPARK-6188] Instance types can be mislabeled when re-starting cluster with default arguments + Theodore Vasiloudis , Theodore Vasiloudis + 2015-03-09 14:16:07 +0000 + Commit: f7c7992, github.com/apache/spark/pull/4916 + + [GraphX] Improve LiveJournalPageRank example + Jacky Li + 2015-03-08 19:47:35 +0000 + Commit: 55b1b32, github.com/apache/spark/pull/4917 + + SPARK-6205 [CORE] UISeleniumSuite fails for Hadoop 2.x test with NoClassDefFoundError + Sean Owen + 2015-03-08 14:09:40 +0000 + Commit: f16b7b0, github.com/apache/spark/pull/4933 + + [SPARK-6193] [EC2] Push group filter up to EC2 + Nicholas Chammas + 2015-03-08 14:01:26 +0000 + Commit: 52ed7da, github.com/apache/spark/pull/4922 + + [SPARK-5641] [EC2] Allow spark_ec2.py to copy arbitrary files to cluster + Florian Verhein + 2015-03-07 12:56:59 +0000 + Commit: 334c5bd, github.com/apache/spark/pull/4583 + + [Minor]fix the wrong description + WangTaoTheTonic + 2015-03-07 12:35:26 +0000 + Commit: 729c05b, github.com/apache/spark/pull/4936 + + [EC2] Reorder print statements on termination + Nicholas Chammas + 2015-03-07 12:33:41 +0000 + Commit: 2646794, github.com/apache/spark/pull/4932 + + Fix python typo (+ Scala, Java typos) + RobertZK , Robert Krzyzanowski + 2015-03-07 00:16:50 +0000 + Commit: 48a723c, github.com/apache/spark/pull/4840 + + [SPARK-6178][Shuffle] Removed unused imports + Vinod K C , Vinod K C + 2015-03-06 14:43:09 +0000 + Commit: dba0b2e, github.com/apache/spark/pull/4900 + + [Minor] Resolve sbt warnings: postfix operator second should be enabled + GuoQiang Li + 2015-03-06 13:20:20 +0000 + Commit: 05cb6b3, github.com/apache/spark/pull/4908 + + [core] [minor] Don't pollute source directory when running UtilsSuite. 
+ Marcelo Vanzin + 2015-03-06 09:43:24 +0000 + Commit: cd7594c, github.com/apache/spark/pull/4921 + + [CORE, DEPLOY][minor] align arguments order with docs of worker + Zhang, Liye + 2015-03-06 09:34:07 +0000 + Commit: d8b3da9, github.com/apache/spark/pull/4924 + + [SQL] Make Strategies a public developer API + Michael Armbrust + 2015-03-05 14:50:25 -0800 + Commit: eb48fd6, github.com/apache/spark/pull/4920 + + [SPARK-6163][SQL] jsonFile should be backed by the data source API + Yin Huai + 2015-03-05 14:49:44 -0800 + Commit: 1b4bb25, github.com/apache/spark/pull/4896 + + [SPARK-6145][SQL] fix ORDER BY on nested fields + Wenchen Fan , Michael Armbrust + 2015-03-05 14:49:01 -0800 + Commit: 5873c71, github.com/apache/spark/pull/4918 + + [SPARK-6175] Fix standalone executor log links when ephemeral ports or SPARK_PUBLIC_DNS are used + Josh Rosen + 2015-03-05 12:04:00 -0800 + Commit: 424a86a, github.com/apache/spark/pull/4903 + + [SPARK-6090][MLLIB] add a basic BinaryClassificationMetrics to PySpark/MLlib + Xiangrui Meng + 2015-03-05 11:50:09 -0800 + Commit: 0bfacd5, github.com/apache/spark/pull/4863 + + SPARK-6182 [BUILD] spark-parent pom needs to be published for both 2.10 and 2.11 + Sean Owen + 2015-03-05 11:31:48 -0800 + Commit: c9cfba0, github.com/apache/spark/pull/4912 + + [SPARK-6153] [SQL] promote guava dep for hive-thriftserver + Daoyuan Wang + 2015-03-05 16:35:17 +0800 + Commit: e06c7df, github.com/apache/spark/pull/4884 + + SPARK-5143 [BUILD] [WIP] spark-network-yarn 2.11 depends on spark-network-shuffle 2.10 + Sean Owen + 2015-03-04 21:00:51 -0800 + Commit: 7ac072f, github.com/apache/spark/pull/4876 + + [SPARK-6149] [SQL] [Build] Excludes Guava 15 referenced by jackson-module-scala_2.10 + Cheng Lian + 2015-03-04 20:52:58 -0800 + Commit: 1aa90e3, github.com/apache/spark/pull/4890 + + [SPARK-6144] [core] Fix addFile when source files are on "hdfs:" + Marcelo Vanzin , trystanleftwich + 2015-03-04 12:58:39 -0800 + Commit: 3a35a0d, github.com/apache/spark/pull/4894 + + [SPARK-6107][CORE] Display inprogress application information for event log history for standalone mode + Zhang, Liye + 2015-03-04 12:28:27 +0000 + Commit: f6773ed, github.com/apache/spark/pull/4848 + + [SPARK-6134][SQL] Fix wrong datatype for casting FloatType and default LongType value in defaultPrimitive + Liang-Chi Hsieh + 2015-03-04 20:23:43 +0800 + Commit: aef8a84, github.com/apache/spark/pull/4870 + + [SPARK-6136] [SQL] Removed JDBC integration tests which depends on docker-client + Cheng Lian + 2015-03-04 19:39:02 +0800 + Commit: 76b472f, github.com/apache/spark/pull/4872 + + [SPARK-3355][Core]: Allow running maven tests in run-tests + Brennon York + 2015-03-04 11:02:33 +0000 + Commit: 418f38d, github.com/apache/spark/pull/4734 + + SPARK-6085 Increase default value for memory overhead + tedyu + 2015-03-04 11:00:52 +0000 + Commit: 8d3e241, github.com/apache/spark/pull/4836 + + [SPARK-6141][MLlib] Upgrade Breeze from 0.10 to 0.11 to fix convergence bug + Xiangrui Meng , DB Tsai , DB Tsai + 2015-03-03 23:52:02 -0800 + Commit: 76e20a0, github.com/apache/spark/pull/4879 + + [SPARK-6132][HOTFIX] ContextCleaner InterruptedException should be quiet + Andrew Or + 2015-03-03 20:49:45 -0800 + Commit: d334bfb, github.com/apache/spark/pull/4882 + + [SPARK-5949] 
HighlyCompressedMapStatus needs more classes registered w/ kryo + Imran Rashid + 2015-03-03 15:33:19 -0800 + Commit: 1f1fccc, github.com/apache/spark/pull/4877 + + [SPARK-6133] Make sc.stop() idempotent + Andrew Or + 2015-03-03 15:09:57 -0800 + Commit: 6c20f35, github.com/apache/spark/pull/4871 + + [SPARK-6132] ContextCleaner race condition across SparkContexts + Andrew Or + 2015-03-03 13:44:05 -0800 + Commit: fe63e82, github.com/apache/spark/pull/4869 + + SPARK-1911 [DOCS] Warn users if their assembly jars are not built with Java 6 + Sean Owen + 2015-03-03 13:40:11 -0800 + Commit: e750a6b, github.com/apache/spark/pull/4874 + + Revert "[SPARK-5423][Core] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file" + Andrew Or + 2015-03-03 13:03:52 -0800 + Commit: 9af0017 + + [SPARK-6138][CORE][minor] enhance the `toArray` method in `SizeTrackingVector` + Wenchen Fan + 2015-03-03 12:12:23 +0000 + Commit: e359794, github.com/apache/spark/pull/4825 + + [SPARK-6118] making package name of deploy.worker.CommandUtils and deploy.CommandUtilsSuite consistent + CodingCat + 2015-03-03 10:32:57 +0000 + Commit: 975643c, github.com/apache/spark/pull/4856 + + BUILD: Minor tweaks to internal build scripts + Patrick Wendell + 2015-03-03 00:38:12 -0800 + Commit: 0c9a8ea + + HOTFIX: Bump HBase version in MapR profiles. + Patrick Wendell + 2015-03-03 01:38:07 -0800 + Commit: 165ff36 + + [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression + DB Tsai + 2015-03-02 22:37:12 -0800 + Commit: b196056, github.com/apache/spark/pull/4866 + + [SPARK-6120] [mllib] Warnings about memory in tree, ensemble model save + Joseph K. Bradley + 2015-03-02 22:33:51 -0800 + Commit: c2fe3a6, github.com/apache/spark/pull/4864 + + [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib + Xiangrui Meng + 2015-03-02 22:27:01 -0800 + Commit: 7e53a79, github.com/apache/spark/pull/4854 + + [SPARK-5310][SQL] Fixes to Docs and Datasources API + Reynold Xin , Michael Armbrust + 2015-03-02 22:14:08 -0800 + Commit: 54d1968, github.com/apache/spark/pull/4868 + + [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api + Yin Huai + 2015-03-02 19:31:55 -0800 + Commit: 1259994, github.com/apache/spark/pull/4826 + + [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs + Tathagata Das + 2015-03-02 18:40:46 -0800 + Commit: 9eb22ec, github.com/apache/spark/pull/4860 + + [SPARK-5537] Add user guide for multinomial logistic regression + Xiangrui Meng , DB Tsai + 2015-03-02 18:10:50 -0800 + Commit: 9d6c5ae, github.com/apache/spark/pull/4801 + + [SPARK-6121][SQL][MLLIB] simpleString for UDT + Xiangrui Meng + 2015-03-02 17:14:34 -0800 + Commit: 2db6a85, github.com/apache/spark/pull/4858 + + [SPARK-4777][CORE] Some block memory after unrollSafely not count into used memory(memoryStore.entrys or unrollMemory) + hushan[胡珊] + 2015-03-02 16:53:54 -0800 + Commit: e3a88d1, github.com/apache/spark/pull/3629 + + [SPARK-6048] SparkConf should not translate deprecated configs on set + Andrew Or + 2015-03-02 16:36:42 -0800 + Commit: 258d154, github.com/apache/spark/pull/4799 + + [SPARK-6066] Make event log format easier to parse + Andrew Or + 2015-03-02 16:34:32 -0800 + Commit: 6776cb3, github.com/apache/spark/pull/4821 + + 
[SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables + Cheng Lian + 2015-03-02 16:18:00 -0800 + Commit: 1a49496, github.com/apache/spark/pull/4842 + + [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved + Michael Armbrust + 2015-03-02 16:10:54 -0800 + Commit: 8223ce6, github.com/apache/spark/pull/4855 + + [SPARK-5522] Accelerate the Histroty Server start + guliangliang + 2015-03-02 15:33:23 -0800 + Commit: 26c1c56, github.com/apache/spark/pull/4525 + + [SPARK-6050] [yarn] Relax matching of vcore count in received containers. + Marcelo Vanzin + 2015-03-02 16:41:43 -0600 + Commit: 6b348d9, github.com/apache/spark/pull/4818 + + [SPARK-6040][SQL] Fix the percent bug in tablesample + q00251598 + 2015-03-02 13:16:29 -0800 + Commit: 582e5a2, github.com/apache/spark/pull/4789 + + [Minor] Fix doc typo for describing primitiveTerm effectiveness condition + Liang-Chi Hsieh + 2015-03-02 13:11:17 -0800 + Commit: 3f9def8, github.com/apache/spark/pull/4762 + + SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs + Sean Owen + 2015-03-02 21:10:08 +0000 + Commit: 0b472f6, github.com/apache/spark/pull/4843 + + [DOCS] Refactored Dataframe join comment to use correct parameter ordering + Paul Power + 2015-03-02 13:08:47 -0800 + Commit: d9a8bae, github.com/apache/spark/pull/4847 + + [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark + Yanbo Liang + 2015-03-02 10:17:24 -0800 + Commit: af2effd, github.com/apache/spark/pull/4831 + + aggregateMessages example in graphX doc + DEBORAH SIEGEL + 2015-03-02 10:15:32 -0800 + Commit: e7d8ae4, github.com/apache/spark/pull/4853 + + [SPARK-5741][SQL] Support the path contains comma in HiveContext + q00251598 + 2015-03-02 10:13:11 -0800 + Commit: 9ce12aa, github.com/apache/spark/pull/4532 + + [SPARK-6111] Fixed usage string in documentation. + Kenneth Myers + 2015-03-02 17:25:24 +0000 + Commit: 95ac68b, github.com/apache/spark/pull/4852 + + [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true + Yin Huai + 2015-03-02 23:18:07 +0800 + Commit: 3efd8bb, github.com/apache/spark/pull/4806 + + [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect + Yin Huai + 2015-03-02 22:42:18 +0800 + Commit: 39a54b4, github.com/apache/spark/pull/4824 + + [SPARK-6103][Graphx]remove unused class to import in EdgeRDDImpl + Lianhui Wang + 2015-03-02 09:06:56 +0000 + Commit: 49c7a8f, github.com/apache/spark/pull/4846 + + SPARK-3357 [CORE] Internal log messages should be set at DEBUG level instead of INFO + Sean Owen + 2015-03-02 08:51:03 +0000 + Commit: 948c239, github.com/apache/spark/pull/4838 + + [Streaming][Minor]Fix some error docs in streaming examples + Saisai Shao + 2015-03-02 08:49:19 +0000 + Commit: d8fb40e, github.com/apache/spark/pull/4837 + + [SPARK-6083] [MLLib] [DOC] Make Python API example consistent in NaiveBayes + MechCoder + 2015-03-01 16:28:15 -0800 + Commit: 3f00bb3, github.com/apache/spark/pull/4834 + + [SPARK-6053][MLLIB] support save/load in PySpark's ALS + Xiangrui Meng + 2015-03-01 16:26:57 -0800 + Commit: aedbbaa, github.com/apache/spark/pull/4811 + + [SPARK-6074] [sql] Package pyspark sql bindings. 
+ Marcelo Vanzin + 2015-03-01 11:05:10 +0000 + Commit: fd8d283, github.com/apache/spark/pull/4822 + + [SPARK-6075] Fix bug in that caused lost accumulator updates: do not store WeakReferences in localAccums map + Josh Rosen + 2015-02-28 22:51:01 -0800 + Commit: 2df5f1f, github.com/apache/spark/pull/4835 + + SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException + Evan Yu + 2015-02-28 18:55:34 -0800 + Commit: 643300a, github.com/apache/spark/pull/4804 + + SPARK-1965 [WEBUI] Spark UI throws NPE on trying to load the app page for non-existent app + Sean Owen + 2015-02-28 15:34:08 +0000 + Commit: 86fcdae, github.com/apache/spark/pull/4777 + + SPARK-5983 [WEBUI] Don't respond to HTTP TRACE in HTTP-based UIs + Sean Owen + 2015-02-28 15:23:59 +0000 + Commit: f91298e, github.com/apache/spark/pull/4765 + + SPARK-6063 MLlib doesn't pass mvn scalastyle check due to UTF chars in LDAModel.scala + Michael Griffiths , Griffiths, Michael (NYC-RPM) + 2015-02-28 14:47:39 +0000 + Commit: b36b1bc, github.com/apache/spark/pull/4815 + + [SPARK-5775] [SQL] BugFix: GenericRow cannot be cast to SpecificMutableRow when nested data and partitioned table + Cheng Lian , Cheng Lian , Yin Huai + 2015-02-28 21:15:43 +0800 + Commit: e6003f0, github.com/apache/spark/pull/4792 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-02-27 23:10:09 -0800 + Commit: 9168259, github.com/apache/spark/pull/1128 + + [SPARK-5979][SPARK-6032] Smaller safer --packages fix + Burak Yavuz + 2015-02-27 22:59:35 -0800 + Commit: 6d8e5fb, github.com/apache/spark/pull/4802 + + [SPARK-6070] [yarn] Remove unneeded classes from shuffle service jar. + Marcelo Vanzin + 2015-02-27 22:44:11 -0800 + Commit: dba08d1, github.com/apache/spark/pull/4820 + + [SPARK-6055] [PySpark] fix incorrect __eq__ of DataType + Davies Liu + 2015-02-27 20:07:17 -0800 + Commit: e0e64ba, github.com/apache/spark/pull/4808 + + [SPARK-5751] [SQL] Sets SPARK_HOME as SPARK_PID_DIR when running Thrift server test suites + Cheng Lian + 2015-02-28 08:41:49 +0800 + Commit: 8c468a6, github.com/apache/spark/pull/4758 + + [Streaming][Minor] Remove useless type signature of Java Kafka direct stream API + Saisai Shao + 2015-02-27 13:01:42 -0800 + Commit: 5f7f3b9, github.com/apache/spark/pull/4817 + + [SPARK-4587] [mllib] [docs] Fixed save,load calls in ML guide examples + Joseph K. 
Bradley + 2015-02-27 13:00:36 -0800 + Commit: d17cb2b, github.com/apache/spark/pull/4816 + + [SPARK-6059][Yarn] Add volatile to ApplicationMaster's reporterThread and allocator + zsxwing + 2015-02-27 13:33:39 +0000 + Commit: 57566d0, github.com/apache/spark/pull/4814 + + [SPARK-6058][Yarn] Log the user class exception in ApplicationMaster + zsxwing + 2015-02-27 13:31:46 +0000 + Commit: e747e98, github.com/apache/spark/pull/4813 + + [SPARK-6036][CORE] avoid race condition between eventlogListener and akka actor system + Zhang, Liye + 2015-02-26 23:11:43 -0800 + Commit: 8cd1692, github.com/apache/spark/pull/4785 + + fix spark-6033, clarify the spark.worker.cleanup behavior in standalone mode + 许鹏 + 2015-02-26 23:05:56 -0800 + Commit: 0375a41, github.com/apache/spark/pull/4803 + + [SPARK-6046] Privatize SparkConf.translateConfKey + Andrew Or + 2015-02-26 22:39:46 -0800 + Commit: 7c99a01, github.com/apache/spark/pull/4797 + + SPARK-2168 [Spark core] Use relative URIs for the app links in the History Server. + Lukasz Jastrzebski + 2015-02-26 22:38:06 -0800 + Commit: 4a8a0a8, github.com/apache/spark/pull/4778 + + [SPARK-5495][UI] Add app and driver kill function in master web UI + jerryshao + 2015-02-26 22:36:48 -0800 + Commit: 67595eb, github.com/apache/spark/pull/4288 + + [SPARK-5771][UI][hotfix] Change Requested Cores into * if default cores is not set + jerryshao + 2015-02-26 22:35:43 -0800 + Commit: 12135e9, github.com/apache/spark/pull/4800 + + [SPARK-6024][SQL] When a data source table has too many columns, it's schema cannot be stored in metastore. + Yin Huai + 2015-02-26 20:46:05 -0800 + Commit: 5e5ad65, github.com/apache/spark/pull/4795 + + [SPARK-6037][SQL] Avoiding duplicate Parquet schema merging + Liang-Chi Hsieh + 2015-02-27 11:06:47 +0800 + Commit: 4ad5153, github.com/apache/spark/pull/4786 + + [SPARK-5529][CORE]Add expireDeadHosts in HeartbeatReceiver + Hong Shen + 2015-02-26 18:43:23 -0800 + Commit: 18f2098, github.com/apache/spark/pull/4363 + + SPARK-4579 [WEBUI] Scheduling Delay appears negative + Sean Owen + 2015-02-26 17:35:09 -0800 + Commit: fbc4694, github.com/apache/spark/pull/4796 + + SPARK-6045 RecordWriter should be checked against null in PairRDDFunctio... 
+ tedyu + 2015-02-26 23:26:07 +0000 + Commit: e60ad2f, github.com/apache/spark/pull/4794 + + [SPARK-5951][YARN] Remove unreachable driver memory properties in yarn client mode + mohit.goyal + 2015-02-26 14:27:47 -0800 + Commit: b38dec2, github.com/apache/spark/pull/4730 + + Add a note for context termination for History server on Yarn + moussa taifi + 2015-02-26 14:19:43 -0800 + Commit: c871e2d, github.com/apache/spark/pull/4721 + + SPARK-4300 [CORE] Race condition during SparkWorker shutdown + Sean Owen + 2015-02-26 14:08:56 -0800 + Commit: 3fb53c0, github.com/apache/spark/pull/4787 + + [SPARK-6018] [YARN] NoSuchMethodError in Spark app is swallowed by YARN AM + Cheolsoo Park + 2015-02-26 13:53:49 -0800 + Commit: 5f3238b, github.com/apache/spark/pull/4773 + + [SPARK-6027][SPARK-5546] Fixed --jar and --packages not working for KafkaUtils and improved error message + Tathagata Das + 2015-02-26 13:46:07 -0800 + Commit: aa63f63, github.com/apache/spark/pull/4779 + + [SPARK-3562]Periodic cleanup event logs + xukun 00228947 + 2015-02-26 13:24:00 -0800 + Commit: 8942b52, github.com/apache/spark/pull/4214 + + Modify default value description for spark.scheduler.minRegisteredResourcesRatio on docs. + Li Zhihui + 2015-02-26 13:07:07 -0800 + Commit: 10094a5, github.com/apache/spark/pull/4781 + + SPARK-4704 [CORE] SparkSubmitDriverBootstrap doesn't flush output + Sean Owen + 2015-02-26 12:56:54 -0800 + Commit: cd5c8d7, github.com/apache/spark/pull/4788 + + [SPARK-5363] Fix bug in PythonRDD: remove() inside iterator is not safe + Davies Liu + 2015-02-26 11:54:17 -0800 + Commit: 7fa960e, github.com/apache/spark/pull/4776 + + [SPARK-6004][MLlib] Pick the best model when training GradientBoostedTrees with validation + Liang-Chi Hsieh + 2015-02-26 10:51:47 -0800 + Commit: cfff397, github.com/apache/spark/pull/4763 + + [SPARK-6007][SQL] Add numRows param in DataFrame.show() + Jacky Li + 2015-02-26 10:40:58 -0800 + Commit: 2358657, github.com/apache/spark/pull/4767 + + [SPARK-5801] [core] Avoid creating nested directories. + Marcelo Vanzin + 2015-02-26 17:35:03 +0000 + Commit: df3d559, github.com/apache/spark/pull/4747 + + [SPARK-6016][SQL] Cannot read the parquet table after overwriting the existing table when spark.sql.parquet.cacheMetadata=true + Yin Huai + 2015-02-27 01:01:32 +0800 + Commit: 192e42a, github.com/apache/spark/pull/4775 + + [SPARK-6023][SQL] ParquetConversions fails to replace the destination MetastoreRelation of an InsertIntoTable node to ParquetRelation2 + Yin Huai + 2015-02-26 22:39:49 +0800 + Commit: f02394d, github.com/apache/spark/pull/4782 + + [SPARK-5914] to run spark-submit requiring only user perm on windows + Judy Nash + 2015-02-26 11:14:37 +0000 + Commit: 51a6f90, github.com/apache/spark/pull/4742 + + [SPARK-5976][MLLIB] Add partitioner to factors returned by ALS + Xiangrui Meng + 2015-02-25 23:43:29 -0800 + Commit: e43139f, github.com/apache/spark/pull/4748 + + [SPARK-5974] [SPARK-5980] [mllib] [python] [docs] Update ML guide with save/load, Python GBT + Joseph K. 
Bradley + 2015-02-25 16:13:17 -0800 + Commit: d20559b, github.com/apache/spark/pull/4750 + + [SPARK-1182][Docs] Sort the configuration parameters in configuration.md + Brennon York + 2015-02-25 16:12:56 -0800 + Commit: 46a044a, github.com/apache/spark/pull/3863 + + [SPARK-5926] [SQL] make DataFrame.explain leverage queryExecution.logical + Yanbo Liang + 2015-02-25 15:37:13 -0800 + Commit: 41e2e5a, github.com/apache/spark/pull/4707 + + [SPARK-5999][SQL] Remove duplicate Literal matching block + Liang-Chi Hsieh + 2015-02-25 15:22:33 -0800 + Commit: 12dbf98, github.com/apache/spark/pull/4760 + + [SPARK-6010] [SQL] Merging compatible Parquet schemas before computing splits + Cheng Lian + 2015-02-25 15:15:22 -0800 + Commit: e0fdd46, github.com/apache/spark/pull/4768 + + [SPARK-5944] [PySpark] fix version in Python API docs + Davies Liu + 2015-02-25 15:13:34 -0800 + Commit: f3f4c87, github.com/apache/spark/pull/4731 + + [SPARK-5982] Remove incorrect Local Read Time Metric + Kay Ousterhout + 2015-02-25 14:55:24 -0800 + Commit: 838a480, github.com/apache/spark/pull/4749 + + [SPARK-1955][GraphX]: VertexRDD can incorrectly assume index sharing + Brennon York + 2015-02-25 14:11:12 -0800 + Commit: 9f603fc, github.com/apache/spark/pull/4705 + + [SPARK-5970][core] Register directory created in getOrCreateLocalRootDirs for automatic deletion. + Milan Straka + 2015-02-25 21:33:34 +0000 + Commit: a777c65, github.com/apache/spark/pull/4759 + + SPARK-5930 [DOCS] Documented default of spark.shuffle.io.retryWait is confusing + Sean Owen + 2015-02-25 12:20:44 -0800 + Commit: 7d8e6a2, github.com/apache/spark/pull/4769 + + [SPARK-5996][SQL] Fix specialized outbound conversions + Michael Armbrust + 2015-02-25 10:13:40 -0800 + Commit: f84c799, github.com/apache/spark/pull/4757 + + [SPARK-5771] Number of Cores in Completed Applications of Standalone Master Web Page always be 0 if sc.stop() is called + guliangliang + 2015-02-25 14:48:02 +0000 + Commit: dd077ab, github.com/apache/spark/pull/4567 + + [GraphX] fixing 3 typos in the graphx programming guide + Benedikt Linse + 2015-02-25 14:46:17 +0000 + Commit: 5b8480e, github.com/apache/spark/pull/4766 + + [SPARK-5666][streaming][MQTT streaming] some trivial fixes + prabs , Prabeesh K + 2015-02-25 14:37:35 +0000 + Commit: d51ed26, github.com/apache/spark/pull/4178 + + [SPARK-5994] [SQL] Python DataFrame documentation fixes + Davies Liu + 2015-02-24 20:51:55 -0800 + Commit: d641fbb, github.com/apache/spark/pull/4756 + + [SPARK-5286][SQL] SPARK-5286 followup + Yin Huai + 2015-02-24 19:51:36 -0800 + Commit: 769e092, github.com/apache/spark/pull/4755 + + [SPARK-5993][Streaming][Build] Fix assembly jar location of kafka-assembly + Tathagata Das + 2015-02-24 19:10:37 -0800 + Commit: 922b43b, github.com/apache/spark/pull/4753 + + [SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python. + Reynold Xin + 2015-02-24 18:59:23 -0800 + Commit: fba11c2, github.com/apache/spark/pull/4752 + + [SPARK-5904][SQL] DataFrame Java API test suites. 
+ Reynold Xin + 2015-02-24 18:51:41 -0800 + Commit: 53a1ebf, github.com/apache/spark/pull/4751 + + [SPARK-5751] [SQL] [WIP] Revamped HiveThriftServer2Suite for robustness + Cheng Lian + 2015-02-25 08:34:55 +0800 + Commit: f816e73, github.com/apache/spark/pull/4720 + + [SPARK-5436] [MLlib] Validate GradientBoostedTrees using runWithValidation + MechCoder + 2015-02-24 15:13:22 -0800 + Commit: 2a0fe34, github.com/apache/spark/pull/4677 + + [SPARK-5973] [PySpark] fix zip with two RDDs with AutoBatchedSerializer + Davies Liu + 2015-02-24 14:50:00 -0800 + Commit: da505e5, github.com/apache/spark/pull/4745 + + [SPARK-5952][SQL] Lock when using hive metastore client + Michael Armbrust + 2015-02-24 13:39:29 -0800 + Commit: a2b9137, github.com/apache/spark/pull/4746 + + [Spark-5708] Add Slf4jSink to Spark Metrics + Judy , judynash + 2015-02-24 20:50:16 +0000 + Commit: c5ba975, github.com/apache/spark/pull/4644 + + [MLLIB] Change x_i to y_i in Variance's user guide + Xiangrui Meng + 2015-02-24 11:38:59 -0800 + Commit: 105791e, github.com/apache/spark/pull/4740 + + [SPARK-5965] Standalone Worker UI displays {{USER_JAR}} + Andrew Or + 2015-02-24 11:08:07 -0800 + Commit: 6d2caa5, github.com/apache/spark/pull/4739 + + [Spark-5967] [UI] Correctly clean JobProgressListener.stageIdToActiveJobIds + Tathagata Das + 2015-02-24 11:02:47 -0800 + Commit: 64d2c01, github.com/apache/spark/pull/4741 + + [SPARK-5532][SQL] Repartition should not use external rdd representation + Michael Armbrust + 2015-02-24 10:52:18 -0800 + Commit: 2012366, github.com/apache/spark/pull/4738 + + [SPARK-5910][SQL] Support for as in selectExpr + Michael Armbrust + 2015-02-24 10:49:51 -0800 + Commit: 0a59e45, github.com/apache/spark/pull/4736 + + [SPARK-5968] [SQL] Suppresses ParquetOutputCommitter WARN logs + Cheng Lian + 2015-02-24 10:45:38 -0800 + Commit: 8403331, github.com/apache/spark/pull/4744 + + [SPARK-5958][MLLIB][DOC] update block matrix user guide + Xiangrui Meng + 2015-02-23 22:08:44 -0800 + Commit: cf2e416, github.com/apache/spark/pull/4737 + + [SPARK-5873][SQL] Allow viewing of partially analyzed plans in queryExecution + Michael Armbrust + 2015-02-23 17:34:54 -0800 + Commit: 1ed5708, github.com/apache/spark/pull/4684 + + [SPARK-5935][SQL] Accept MapType in the schema provided to a JSON dataset. + Yin Huai , Yin Huai + 2015-02-23 17:16:34 -0800 + Commit: 48376bf, github.com/apache/spark/pull/4710 + + [SPARK-5912] [docs] [mllib] Small fixes to ChiSqSelector docs + Joseph K. Bradley + 2015-02-23 16:15:57 -0800 + Commit: 59536cc, github.com/apache/spark/pull/4732 + + [MLLIB] SPARK-5912 Programming guide for feature selection + Alexander Ulanov + 2015-02-23 12:09:40 -0800 + Commit: 28ccf5e, github.com/apache/spark/pull/4709 + + [SPARK-5939][MLLib] make FPGrowth example app take parameters + Jacky Li + 2015-02-23 08:47:28 -0800 + Commit: 651a1c0, github.com/apache/spark/pull/4714 + + [SPARK-5724] fix the misconfiguration in AkkaUtils + CodingCat + 2015-02-23 11:29:25 +0000 + Commit: 242d495, github.com/apache/spark/pull/4512 + + [SPARK-5943][Streaming] Update the test to use new API to reduce the warning + Saisai Shao + 2015-02-23 11:27:27 +0000 + Commit: 757b14b, github.com/apache/spark/pull/4722 + + [EXAMPLES] fix typo. 
+ Makoto Fukuhara + 2015-02-23 09:24:33 +0000 + Commit: 9348767, github.com/apache/spark/pull/4724 + + [SPARK-3885] Provide mechanism to remove accumulators once they are no longer used + Ilya Ganelin + 2015-02-22 22:43:04 -0800 + Commit: 95cd643, github.com/apache/spark/pull/4021 + + [SPARK-911] allow efficient queries for a range if RDD is partitioned wi... + Aaron Josephs + 2015-02-22 22:09:06 -0800 + Commit: e4f9d03, github.com/apache/spark/pull/1381 + + [DataFrame] [Typo] Fix the typo + Cheng Hao + 2015-02-22 08:56:30 +0000 + Commit: 275b1be, github.com/apache/spark/pull/4717 + + [DOCS] Fix typo in API for custom InputFormats based on the “new” MapReduce API + Alexander + 2015-02-22 08:53:05 +0000 + Commit: a7f9039, github.com/apache/spark/pull/4718 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-02-21 23:07:30 -0800 + Commit: 46462ff, github.com/apache/spark/pull/3490 + + [SPARK-5860][CORE] JdbcRDD: overflow on large range with high number of partitions + Evan Yu + 2015-02-21 20:40:21 +0000 + Commit: 7683982, github.com/apache/spark/pull/4701 + + [SPARK-5937][YARN] Fix ClientSuite to set YARN mode, so that the correct class is used in t... + Hari Shreedharan + 2015-02-21 10:01:01 -0800 + Commit: 7138816, github.com/apache/spark/pull/4711 + + SPARK-5841 [CORE] [HOTFIX 2] Memory leak in DiskBlockManager + Nishkam Ravi , nishkamravi2 , nravi + 2015-02-21 09:59:28 -0800 + Commit: d3cbd38, github.com/apache/spark/pull/4690 + + [MLlib] fix typo + Jacky Li + 2015-02-21 13:00:16 +0000 + Commit: e155324, github.com/apache/spark/pull/4713 + + [SPARK-5898] [SPARK-5896] [SQL] [PySpark] create DataFrame from pandas and tuple/list + Davies Liu + 2015-02-20 15:35:05 -0800 + Commit: 5b0a42c, github.com/apache/spark/pull/4679 + + [SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release + Joseph K. Bradley + 2015-02-20 02:31:32 -0800 + Commit: 4a17eed, github.com/apache/spark/pull/4675 + + SPARK-5744 [CORE] Take 2. RDD.isEmpty / take fails for (empty) RDD of Nothing + Sean Owen + 2015-02-20 10:21:39 +0000 + Commit: d3dfebe, github.com/apache/spark/pull/4698 + + [SPARK-5909][SQL] Add a clearCache command to Spark SQL's cache manager + Yin Huai + 2015-02-20 16:20:02 +0800 + Commit: 70bfb5c, github.com/apache/spark/pull/4694 + + [SPARK-4808] Removing minimum number of elements read before spill check + mcheah + 2015-02-19 18:09:22 -0800 + Commit: 3be92cd, github.com/apache/spark/pull/4420 + + [SPARK-5900][MLLIB] make PIC and FPGrowth Java-friendly + Xiangrui Meng + 2015-02-19 18:06:16 -0800 + Commit: 0cfd2ce, github.com/apache/spark/pull/4695 + + SPARK-5570: No docs stating that `new SparkConf().set("spark.driver.memory", ...) will not work + Ilya Ganelin + 2015-02-19 15:50:58 -0800 + Commit: 6bddc40, github.com/apache/spark/pull/4665 + + SPARK-4682 [CORE] Consolidate various 'Clock' classes + Sean Owen + 2015-02-19 15:35:23 -0800 + Commit: 34b7c35, github.com/apache/spark/pull/4514 + + [Spark-5889] Remove pid file after stopping service. + Zhan Zhang + 2015-02-19 23:13:02 +0000 + Commit: ad6b169, github.com/apache/spark/pull/4676 + + [SPARK-5902] [ml] Made PipelineStage.transformSchema public instead of private to ml + Joseph K. 
Bradley + 2015-02-19 12:46:27 -0800 + Commit: a5fed34, github.com/apache/spark/pull/4682 + + [SPARK-5904][SQL] DataFrame API fixes. + Reynold Xin + 2015-02-19 12:09:44 -0800 + Commit: 8ca3418, github.com/apache/spark/pull/4686 + + [SPARK-5825] [Spark Submit] Remove the double checking instance name when stopping the service + Cheng Hao + 2015-02-19 12:07:51 -0800 + Commit: 94cdb05, github.com/apache/spark/pull/4611 + + [SPARK-5423][Core] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file + zsxwing + 2015-02-19 18:37:31 +0000 + Commit: 90095bf, github.com/apache/spark/pull/4219 + + [SPARK-5816] Add huge compatibility warning in DriverWrapper + Andrew Or + 2015-02-19 09:56:25 -0800 + Commit: 38e624a, github.com/apache/spark/pull/4687 + + SPARK-5548: Fix for AkkaUtilsSuite failure - attempt 2 + Jacek Lewandowski + 2015-02-19 09:53:36 -0800 + Commit: fb87f44, github.com/apache/spark/pull/4653 + + [SPARK-5846] Correctly set job description and pool for SQL jobs + Kay Ousterhout + 2015-02-19 09:49:34 +0800 + Commit: e945aa6, github.com/apache/spark/pull/4630 + + [SPARK-5879][MLLIB] update PIC user guide and add a Java example + Xiangrui Meng + 2015-02-18 16:29:32 -0800 + Commit: d12d2ad, github.com/apache/spark/pull/4680 + + [SPARK-5722] [SQL] [PySpark] infer int as LongType + Davies Liu + 2015-02-18 14:17:04 -0800 + Commit: aa8f10e, github.com/apache/spark/pull/4666 + + [SPARK-5840][SQL] HiveContext cannot be serialized due to tuple extraction + Reynold Xin + 2015-02-18 14:02:32 -0800 + Commit: f0e3b71, github.com/apache/spark/pull/4628 + + [SPARK-5507] Added documentation for BlockMatrix + Burak Yavuz + 2015-02-18 10:11:08 -0800 + Commit: a8eb92d, github.com/apache/spark/pull/4664 + + [SPARK-5519][MLLIB] add user guide with example code for fp-growth + Xiangrui Meng + 2015-02-18 10:09:56 -0800 + Commit: 85e9d09, github.com/apache/spark/pull/4661 + + SPARK-5669 [BUILD] [HOTFIX] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS + Sean Owen + 2015-02-18 14:41:44 +0000 + Commit: 5aecdcf, github.com/apache/spark/pull/4673 + + [SPARK-4949]shutdownCallback in SparkDeploySchedulerBackend should be enclosed by synchronized block. + Kousuke Saruta + 2015-02-18 12:20:11 +0000 + Commit: 82197ed, github.com/apache/spark/pull/3781 + + SPARK-4610 addendum: [Minor] [MLlib] Minor doc fix in GBT classification example + MechCoder + 2015-02-18 10:13:28 +0000 + Commit: e79a7a6, github.com/apache/spark/pull/4672 + + [SPARK-5878] fix DataFrame.repartition() in Python + Davies Liu + 2015-02-18 01:00:54 -0800 + Commit: c1b6fa9, github.com/apache/spark/pull/4667 + + Avoid deprecation warnings in JDBCSuite. + Tor Myklebust + 2015-02-18 01:00:13 -0800 + Commit: de0dd6d, github.com/apache/spark/pull/4668 + + [Minor] [SQL] Cleans up DataFrame variable names and toDF() calls + Cheng Lian + 2015-02-17 23:36:20 -0800 + Commit: 61ab085, github.com/apache/spark/pull/4670 + + [SPARK-5731][Streaming][Test] Fix incorrect test in DirectKafkaStreamSuite + Tathagata Das + 2015-02-17 22:44:16 -0800 + Commit: 3912d33, github.com/apache/spark/pull/4597 + + [SPARK-5723][SQL]Change the default file format to Parquet for CTAS statements. 
+ Yin Huai + 2015-02-17 18:14:33 -0800 + Commit: e50934f, github.com/apache/spark/pull/4639 + + [SPARK-5875][SQL]logical.Project should not be resolved if it contains aggregates or generators + Yin Huai + 2015-02-17 17:50:39 -0800 + Commit: d5f12bf, github.com/apache/spark/pull/4663 + + [SPARK-4454] Revert getOrElse() cleanup in DAGScheduler.getCacheLocs() + Josh Rosen + 2015-02-17 17:45:16 -0800 + Commit: a51fc7e + + [SPARK-4454] Properly synchronize accesses to DAGScheduler cacheLocs map + Josh Rosen + 2015-02-17 17:39:58 -0800 + Commit: d46d624, github.com/apache/spark/pull/4660 + + [SPARK-5811] Added documentation for maven coordinates and added Spark Packages support + Burak Yavuz , Davies Liu + 2015-02-17 17:15:43 -0800 + Commit: ae6cfb3, github.com/apache/spark/pull/4662 + + [SPARK-5785] [PySpark] narrow dependency for cogroup/join in PySpark + Davies Liu + 2015-02-17 16:54:57 -0800 + Commit: c3d2b90, github.com/apache/spark/pull/4629 + + [SPARK-5852][SQL]Fail to convert a newly created empty metastore parquet table to a data source parquet table. + Yin Huai , Cheng Hao + 2015-02-17 15:47:59 -0800 + Commit: 117121a, github.com/apache/spark/pull/4655 + + [SPARK-5872] [SQL] create a sqlCtx in pyspark shell + Davies Liu + 2015-02-17 15:44:37 -0800 + Commit: 4d4cc76, github.com/apache/spark/pull/4659 + + [SPARK-5871] output explain in Python + Davies Liu + 2015-02-17 13:48:38 -0800 + Commit: 3df85dc, github.com/apache/spark/pull/4658 + + [SPARK-4172] [PySpark] Progress API in Python + Davies Liu + 2015-02-17 13:36:43 -0800 + Commit: 445a755, github.com/apache/spark/pull/3027 + + [SPARK-5868][SQL] Fix python UDFs in HiveContext and checks in SQLContext + Michael Armbrust + 2015-02-17 13:23:45 -0800 + Commit: de4836f, github.com/apache/spark/pull/4657 + + [SQL] [Minor] Update the HiveContext Unittest + Cheng Hao + 2015-02-17 12:25:35 -0800 + Commit: 9d281fa, github.com/apache/spark/pull/4584 + + [Minor][SQL] Use same function to check path parameter in JSONRelation + Liang-Chi Hsieh + 2015-02-17 12:24:13 -0800 + Commit: ac506b7, github.com/apache/spark/pull/4649 + + [SPARK-5862][SQL] Only transformUp the given plan once in HiveMetastoreCatalog + Liang-Chi Hsieh + 2015-02-17 12:23:18 -0800 + Commit: 4611de1, github.com/apache/spark/pull/4651 + + [Minor] fix typo in SQL document + CodingCat + 2015-02-17 12:16:52 -0800 + Commit: 31efb39, github.com/apache/spark/pull/4656 + + [SPARK-5864] [PySpark] support .jar as python package + Davies Liu + 2015-02-17 12:05:06 -0800 + Commit: fc4eb95, github.com/apache/spark/pull/4652 + + SPARK-5841 [CORE] [HOTFIX] Memory leak in DiskBlockManager + Sean Owen + 2015-02-17 19:40:06 +0000 + Commit: 49c19fd, github.com/apache/spark/pull/4648 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2015-02-17 11:35:26 -0800 + Commit: 24f358b, github.com/apache/spark/pull/3297 + + [SPARK-3381] [MLlib] Eliminate bins for unordered features in DecisionTrees + MechCoder + 2015-02-17 11:19:23 -0800 + Commit: 9b746f3, github.com/apache/spark/pull/4231 + + [SPARK-5661]function hasShutdownDeleteTachyonDir should use shutdownDeleteTachyonPaths to determine whether contains file + xukun 00228947 , viper-kun + 2015-02-17 18:59:41 +0000 + Commit: b271c26, github.com/apache/spark/pull/4418 + + [SPARK-5778] throw if nonexistent metrics config file provided + Ryan Williams + 2015-02-17 10:57:16 -0800 + Commit: d8f69cf, github.com/apache/spark/pull/4571 + + [SPARK-5859] [PySpark] [SQL] fix DataFrame Python API + Davies Liu + 2015-02-17 10:22:48 -0800 + Commit: d8adefe, github.com/apache/spark/pull/4645 + + [SPARK-5166][SPARK-5247][SPARK-5258][SQL] API Cleanup / Documentation + Michael Armbrust + 2015-02-17 10:21:17 -0800 + Commit: c74b07f, github.com/apache/spark/pull/4642 + + [SPARK-5858][MLLIB] Remove unnecessary first() call in GLM + Xiangrui Meng + 2015-02-17 10:17:45 -0800 + Commit: c76da36, github.com/apache/spark/pull/4647 + + SPARK-5856: In Maven build script, launch Zinc with more memory + Patrick Wendell + 2015-02-17 10:10:01 -0800 + Commit: 3ce46e9, github.com/apache/spark/pull/4643 + + Revert "[SPARK-5363] [PySpark] check ending mark in non-block way" + Josh Rosen + 2015-02-17 07:48:27 -0800 + Commit: ee6e3ef + + [SPARK-5826][Streaming] Fix Configuration not serializable problem + jerryshao + 2015-02-17 10:45:18 +0000 + Commit: a65766b, github.com/apache/spark/pull/4612 + + HOTFIX: Style issue causing build break + Patrick Wendell + 2015-02-16 22:10:39 -0800 + Commit: c06e42f + + [SPARK-5802][MLLIB] cache transformed data in glm + Xiangrui Meng + 2015-02-16 22:09:04 -0800 + Commit: fd84229, github.com/apache/spark/pull/4593 + + [SPARK-5853][SQL] Schema support in Row. + Reynold Xin + 2015-02-16 20:42:57 -0800 + Commit: d380f32, github.com/apache/spark/pull/4640 + + SPARK-5850: Remove experimental label for Scala 2.11 and FlumePollingStream + Patrick Wendell + 2015-02-16 20:33:33 -0800 + Commit: a51d51f, github.com/apache/spark/pull/4638 + + [SPARK-5363] [PySpark] check ending mark in non-block way + Davies Liu + 2015-02-16 20:32:03 -0800 + Commit: ac6fe67, github.com/apache/spark/pull/4601 + + [SQL] Various DataFrame doc changes. 
+ Reynold Xin + 2015-02-16 19:00:30 -0800 + Commit: 0e180bf, github.com/apache/spark/pull/4636 + + [SPARK-5849] Handle more types of invalid JSON requests in SubmitRestProtocolMessage.parseAction + Josh Rosen + 2015-02-16 18:08:02 -0800 + Commit: 58a82a7, github.com/apache/spark/pull/4637 + + [SPARK-3340] Deprecate ADD_JARS and ADD_FILES + azagrebin + 2015-02-16 18:06:19 -0800 + Commit: 1668765, github.com/apache/spark/pull/4616 + + [SPARK-5788] [PySpark] capture the exception in python write thread + Davies Liu + 2015-02-16 17:57:14 -0800 + Commit: b1bd1dd, github.com/apache/spark/pull/4577 + + SPARK-5848: tear down the ConsoleProgressBar timer + Matt Whelan + 2015-02-17 00:59:49 +0000 + Commit: 1294a6e, github.com/apache/spark/pull/4635 + + [SPARK-4865][SQL]Include temporary tables in SHOW TABLES + Yin Huai + 2015-02-16 15:59:23 -0800 + Commit: e189cbb, github.com/apache/spark/pull/4618 + + [SQL] Optimize arithmetic and predicate operators + kai + 2015-02-16 15:58:05 -0800 + Commit: cb6c48c, github.com/apache/spark/pull/4472 + + [SPARK-5839][SQL]HiveMetastoreCatalog does not recognize table names and aliases of data source tables. + Yin Huai + 2015-02-16 15:54:01 -0800 + Commit: f3ff1eb, github.com/apache/spark/pull/4626 + + [SPARK-5746][SQL] Check invalid cases for the write path of data source API + Yin Huai + 2015-02-16 15:51:59 -0800 + Commit: 5b6cd65, github.com/apache/spark/pull/4617 + + HOTFIX: Break in Jekyll build from #4589 + Patrick Wendell + 2015-02-16 15:43:56 -0800 + Commit: 04b401d + + [SPARK-2313] Use socket to communicate GatewayServer port back to Python driver + Josh Rosen + 2015-02-16 15:25:11 -0800 + Commit: 0cfda84, github.com/apache/spark/pull/3424. + + SPARK-5357: Update commons-codec version to 1.10 (current) + Matt Whelan + 2015-02-16 23:05:34 +0000 + Commit: c01c4eb, github.com/apache/spark/pull/4153 + + SPARK-5841: remove DiskBlockManager shutdown hook on stop + Matt Whelan + 2015-02-16 22:54:32 +0000 + Commit: bb05982, github.com/apache/spark/pull/4627 + + [SPARK-5833] [SQL] Adds REFRESH TABLE command + Cheng Lian + 2015-02-16 12:52:05 -0800 + Commit: c51ab37, github.com/apache/spark/pull/4624 + + [SPARK-5296] [SQL] Add more filter types for data sources API + Cheng Lian + 2015-02-16 12:48:55 -0800 + Commit: 6f54dee, github.com/apache/spark/pull/4623 + + [SQL] Add fetched row count in SparkSQLCLIDriver + OopsOutOfMemory + 2015-02-16 12:34:09 -0800 + Commit: b4d7c70, github.com/apache/spark/pull/4604 + + [SQL] Initial support for reporting location of error in sql string + Michael Armbrust + 2015-02-16 12:32:56 -0800 + Commit: 104b2c4, github.com/apache/spark/pull/4587 + + [SPARK-5824] [SQL] add null format in ctas and set default col comment to null + Daoyuan Wang + 2015-02-16 12:31:36 -0800 + Commit: 275a0c0, github.com/apache/spark/pull/4609 + + [SQL] [Minor] Update the SpecificMutableRow.copy + Cheng Hao + 2015-02-16 12:21:08 -0800 + Commit: cc552e0, github.com/apache/spark/pull/4619 + + SPARK-5795 [STREAMING] api.java.JavaPairDStream.saveAsNewAPIHadoopFiles may not friendly to java + Sean Owen + 2015-02-16 19:32:31 +0000 + Commit: 8e25373, github.com/apache/spark/pull/4608 + + Minor fixes for commit https://github.com/apache/spark/pull/4592. 
+ Reynold Xin + 2015-02-16 10:09:55 -0800 + Commit: 9baac56 + + [SPARK-5799][SQL] Compute aggregation function on specified numeric columns + Liang-Chi Hsieh + 2015-02-16 10:06:11 -0800 + Commit: 5c78be7, github.com/apache/spark/pull/4592 + + SPARK-5815 [MLLIB] Part 2. Deprecate SVDPlusPlus APIs that expose DoubleMatrix from JBLAS + Sean Owen + 2015-02-16 17:04:30 +0000 + Commit: a3afa4a, github.com/apache/spark/pull/4625 + + [SPARK-5831][Streaming]When checkpoint file size is bigger than 10, then delete the old ones + Xutingjun <1039320815@qq.com> + 2015-02-16 14:54:23 +0000 + Commit: 1115e8e, github.com/apache/spark/pull/4621 + + [SPARK-4553] [SPARK-5767] [SQL] Wires Parquet data source with the newly introduced write support for data source API + Cheng Lian + 2015-02-16 01:38:31 -0800 + Commit: 3ce58cf, github.com/apache/spark/pull/4563 + + [Minor] [SQL] Renames stringRddToDataFrame to stringRddToDataFrameHolder for consistency + Cheng Lian + 2015-02-16 01:33:37 -0800 + Commit: 199a9e8, github.com/apache/spark/pull/4613 + + [Ml] SPARK-5804 Explicitly manage cache in Crossvalidator k-fold loop + Peter Rudenko + 2015-02-16 00:07:23 -0800 + Commit: d51d6ba, github.com/apache/spark/pull/4595 + + [Ml] SPARK-5796 Don't transform data on a last estimator in Pipeline + Peter Rudenko + 2015-02-15 20:51:32 -0800 + Commit: c78a12c, github.com/apache/spark/pull/4590 + + SPARK-5815 [MLLIB] Deprecate SVDPlusPlus APIs that expose DoubleMatrix from JBLAS + Sean Owen + 2015-02-15 20:41:27 -0800 + Commit: acf2558, github.com/apache/spark/pull/4614 + + [SPARK-5769] Set params in constructors and in setParams in Python ML pipelines + Xiangrui Meng + 2015-02-15 20:29:26 -0800 + Commit: cd4a153, github.com/apache/spark/pull/4564 + + SPARK-5669 [BUILD] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS + Sean Owen + 2015-02-15 09:15:48 -0800 + Commit: 836577b, github.com/apache/spark/pull/4453 + + [MLLIB][SPARK-5502] User guide for isotonic regression + martinzapletal + 2015-02-15 09:10:03 -0800 + Commit: 61eb126, github.com/apache/spark/pull/4536 + + [SPARK-5827][SQL] Add missing import in the example of SqlContext + Takeshi Yamamuro + 2015-02-15 14:42:20 +0000 + Commit: c771e47, github.com/apache/spark/pull/4615 + + SPARK-5822 [BUILD] cannot import src/main/scala & src/test/scala into eclipse as source folder + gli + 2015-02-14 20:43:27 +0000 + Commit: ed5f4bb, github.com/apache/spark/pull/4531 + + Revise formatting of previous commit f80e2629bb74bc62960c61ff313f7e7802d61319 + Sean Owen + 2015-02-14 20:12:29 +0000 + Commit: 15a2ab5 + + [SPARK-5800] Streaming Docs. 
Change linked files according the selected language + gasparms + 2015-02-14 20:10:29 +0000 + Commit: f80e262, github.com/apache/spark/pull/4589 + + [SPARK-5752][SQL] Don't implicitly convert RDDs directly to DataFrames + Reynold Xin , Davies Liu + 2015-02-13 23:03:22 -0800 + Commit: e98dfe6, github.com/apache/spark/pull/4556 + + SPARK-3290 [GRAPHX] No unpersist callls in SVDPlusPlus + Sean Owen + 2015-02-13 20:12:52 -0800 + Commit: 0ce4e43, github.com/apache/spark/pull/4234 + + [SPARK-5227] [SPARK-5679] Disable FileSystem cache in WholeTextFileRecordReaderSuite + Josh Rosen + 2015-02-13 17:45:31 -0800 + Commit: d06d5ee, github.com/apache/spark/pull/4599 + + [SPARK-5730][ML] add doc groups to spark.ml components + Xiangrui Meng + 2015-02-13 16:45:59 -0800 + Commit: 4f4c6d5, github.com/apache/spark/pull/4600 + + [SPARK-5803][MLLIB] use ArrayBuilder to build primitive arrays + Xiangrui Meng + 2015-02-13 16:43:49 -0800 + Commit: d50a91d, github.com/apache/spark/pull/4594 + + [SPARK-5806] re-organize sections in mllib-clustering.md + Xiangrui Meng + 2015-02-13 15:09:27 -0800 + Commit: cc56c87, github.com/apache/spark/pull/4598 + + [SPARK-5789][SQL]Throw a better error message if JsonRDD.parseJson encounters unrecoverable parsing errors. + Yin Huai + 2015-02-13 13:51:06 -0800 + Commit: 2e0c084, github.com/apache/spark/pull/4582 + + [SPARK-5642] [SQL] Apply column pruning on unused aggregation fields + Daoyuan Wang , Michael Armbrust + 2015-02-13 13:46:50 -0800 + Commit: 2cbb3e4, github.com/apache/spark/pull/4415 + + [HOTFIX] Fix build break in MesosSchedulerBackendSuite + Andrew Or + 2015-02-13 13:10:29 -0800 + Commit: 5d3cc6b + + [HOTFIX] Ignore DirectKafkaStreamSuite. + Reynold Xin + 2015-02-13 12:43:53 -0800 + Commit: 378c7eb + + SPARK-5805 Fixed the type error in documentation. + Emre Sevinç + 2015-02-13 12:31:27 -0800 + Commit: 9f31db0, github.com/apache/spark/pull/4596 + + [SPARK-5735] Replace uses of EasyMock with Mockito + Josh Rosen + 2015-02-13 09:53:57 -0800 + Commit: 077eec2, github.com/apache/spark/pull/4578 + + [SPARK-5783] Better eventlog-parsing error messages + Ryan Williams + 2015-02-13 09:47:26 -0800 + Commit: fc6d3e7, github.com/apache/spark/pull/4573 + + [SPARK-5503][MLLIB] Example code for Power Iteration Clustering + sboeschhuawei + 2015-02-13 09:45:57 -0800 + Commit: e1a1ff8, github.com/apache/spark/pull/4495 + + [SPARK-5732][CORE]:Add an option to print the spark version in spark script. + uncleGen , genmao.ygm + 2015-02-13 09:43:10 -0800 + Commit: c0ccd25, github.com/apache/spark/pull/4522 + + [SPARK-4832][Deploy]some other processes might take the daemon pid + WangTaoTheTonic , WangTaoTheTonic + 2015-02-13 10:27:23 +0000 + Commit: 1768bd5, github.com/apache/spark/pull/3683 + + [SPARK-3365][SQL]Wrong schema generated for List type + tianyi + 2015-02-12 22:18:39 -0800 + Commit: 1c8633f, github.com/apache/spark/pull/4581 + + [SQL] Fix docs of SQLContext.tables + Yin Huai + 2015-02-12 20:37:55 -0800 + Commit: 2aea892, github.com/apache/spark/pull/4579 + + [SPARK-3299][SQL]Public API in SQLContext to list tables + Yin Huai + 2015-02-12 18:08:01 -0800 + Commit: 1d0596a, github.com/apache/spark/pull/4547 + + [SQL] Move SaveMode to SQL package. 
+ Yin Huai + 2015-02-12 15:32:17 -0800 + Commit: c025a46, github.com/apache/spark/pull/4542 + + [SPARK-5335] Fix deletion of security groups within a VPC + Vladimir Grigor , Vladimir Grigor + 2015-02-12 23:26:24 +0000 + Commit: ada993e, github.com/apache/spark/pull/4122 + + [SPARK-5755] [SQL] remove unnecessary Add + Daoyuan Wang + 2015-02-12 15:22:07 -0800 + Commit: d5fc514, github.com/apache/spark/pull/4551 + + [SPARK-5573][SQL] Add explode to dataframes + Michael Armbrust + 2015-02-12 15:19:19 -0800 + Commit: ee04a8b, github.com/apache/spark/pull/4546 + + [SPARK-5758][SQL] Use LongType as the default type for integers in JSON schema inference. + Yin Huai + 2015-02-12 15:17:25 -0800 + Commit: c352ffb, github.com/apache/spark/pull/4544 + + [SPARK-5780] [PySpark] Mute the logging during unit tests + Davies Liu + 2015-02-12 14:54:38 -0800 + Commit: 0bf0315, github.com/apache/spark/pull/4572 + + SPARK-5747: Fix wordsplitting bugs in make-distribution.sh + David Y. Ross + 2015-02-12 14:52:38 -0800 + Commit: 26c816e, github.com/apache/spark/pull/4540 + + [SPARK-5759][Yarn]ExecutorRunnable should catch YarnException while NMClient start contain... + lianhuiwang + 2015-02-12 14:50:16 -0800 + Commit: 947b8bd, github.com/apache/spark/pull/4554 + + [SPARK-5760][SPARK-5761] Fix standalone rest protocol corner cases + revamp tests + Andrew Or + 2015-02-12 14:47:52 -0800 + Commit: 1d5663e, github.com/apache/spark/pull/4557 + + [SPARK-5762] Fix shuffle write time for sort-based shuffle + Kay Ousterhout + 2015-02-12 14:46:37 -0800 + Commit: 47c73d4, github.com/apache/spark/pull/4559 + + [SPARK-5765][Examples]Fixed word split problem in run-example and compute-classpath + Venkata Ramana G , Venkata Ramana Gollamudi + 2015-02-12 14:44:21 -0800 + Commit: 629d014, github.com/apache/spark/pull/4561 + + [EC2] Update default Spark version to 1.2.1 + Katsunori Kanda + 2015-02-12 14:38:42 -0800 + Commit: 9c80765, github.com/apache/spark/pull/4566 + + [SPARK-5645] Added local read bytes/time to task metrics + Kay Ousterhout + 2015-02-12 14:35:44 -0800 + Commit: 893d6fd, github.com/apache/spark/pull/4510 + + [SQL] Improve error messages + Michael Armbrust , wangfei + 2015-02-12 13:11:28 -0800 + Commit: aa4ca8b, github.com/apache/spark/pull/4558 + + [SQL][DOCS] Update sql documentation + Antonio Navarro Perez + 2015-02-12 12:46:17 -0800 + Commit: 6a1be02, github.com/apache/spark/pull/4560 + + SPARK-5776 JIRA version not of form x.y.z breaks merge_spark_pr.py + Sean Owen + 2015-02-12 20:14:45 +0000 + Commit: bc57789, github.com/apache/spark/pull/4570 + + [SPARK-5757][MLLIB] replace SQL JSON usage in model import/export by json4s + Xiangrui Meng + 2015-02-12 10:48:13 -0800 + Commit: 99bd500, github.com/apache/spark/pull/4555 + + [SPARK-5655] Don't chmod700 application files if running in YARN + Andrew Rowson + 2015-02-12 18:41:39 +0000 + Commit: 466b1f6, github.com/apache/spark/pull/4509 + + ignore cache paths for RAT tests + Oren Mazor + 2015-02-12 18:37:00 +0000 + Commit: 9a6efbc, github.com/apache/spark/pull/4569 + + SPARK-5727 [BUILD] Remove Debian packaging + Sean Owen + 2015-02-12 12:36:26 +0000 + Commit: 9a3ea49, github.com/apache/spark/pull/4526 + + [SQL] Make dataframe more tolerant of being serialized + Michael Armbrust + 2015-02-11 
19:05:49 -0800 + Commit: a38e23c, github.com/apache/spark/pull/4545 + + [SQL] Two DataFrame fixes. + Reynold Xin + 2015-02-11 18:32:48 -0800 + Commit: d931b01, github.com/apache/spark/pull/4543 + + [SPARK-3688][SQL] More inline comments for LogicalPlan. + Reynold Xin + 2015-02-11 15:26:31 -0800 + Commit: fa6bdc6, github.com/apache/spark/pull/4539 + + [SPARK-3688][SQL]LogicalPlan can't resolve column correctlly + tianyi + 2015-02-11 12:50:17 -0800 + Commit: 44b2311, github.com/apache/spark/pull/4524 + + [SPARK-5454] More robust handling of self joins + Michael Armbrust + 2015-02-11 12:31:56 -0800 + Commit: a60d2b7, github.com/apache/spark/pull/4520 + + Remove outdated remark about take(n). + Daniel Darabos + 2015-02-11 20:24:17 +0000 + Commit: 03bf704, github.com/apache/spark/pull/4533 + + [SPARK-5677] [SPARK-5734] [SQL] [PySpark] Python DataFrame API remaining tasks + Davies Liu + 2015-02-11 12:13:16 -0800 + Commit: b694eb9, github.com/apache/spark/pull/4528 + + [SPARK-5733] Error Link in Pagination of HistroyPage when showing Incomplete Applications + guliangliang + 2015-02-11 15:55:49 +0000 + Commit: 1ac099e, github.com/apache/spark/pull/4523 + + SPARK-5727 [BUILD] Deprecate Debian packaging + Sean Owen + 2015-02-11 08:30:16 +0000 + Commit: bd0d6e0, github.com/apache/spark/pull/4516 + + SPARK-5728 [STREAMING] MQTTStreamSuite leaves behind ActiveMQ database files + Sean Owen + 2015-02-11 08:13:51 +0000 + Commit: da89720, github.com/apache/spark/pull/4517 + + [SPARK-4964] [Streaming] refactor createRDD to take leaders via map instead of array + cody koeninger + 2015-02-11 00:13:27 -0800 + Commit: 658687b, github.com/apache/spark/pull/4511 + + HOTFIX: Adding Junit to Hive tests for Maven build + Patrick Wendell + 2015-02-10 23:39:21 -0800 + Commit: c2131c0 + + HOTFIX: Java 6 compilation error in Spark SQL + Patrick Wendell + 2015-02-10 22:43:32 -0800 + Commit: 7e2f882 + + [SPARK-5714][Mllib] Refactor initial step of LDA to remove redundant operations + Liang-Chi Hsieh + 2015-02-10 21:51:15 -0800 + Commit: f86a89a, github.com/apache/spark/pull/4501 + + [SPARK-5702][SQL] Allow short names for built-in data sources. + Reynold Xin + 2015-02-10 20:40:21 -0800 + Commit: b8f88d3, github.com/apache/spark/pull/4489 + + [SPARK-5729] Potential NPE in standalone REST API + Andrew Or + 2015-02-10 20:19:14 -0800 + Commit: b969182, github.com/apache/spark/pull/4518 + + [SPARK-4879] Use driver to coordinate Hadoop output committing for speculative tasks + mcheah , Josh Rosen + 2015-02-10 20:12:18 -0800 + Commit: 1cb3770, github.com/apache/spark/pull/4155. + + [SQL][DataFrame] Fix column computability bug. + Reynold Xin + 2015-02-10 19:50:44 -0800 + Commit: 7e24249, github.com/apache/spark/pull/4519 + + [SPARK-5709] [SQL] Add EXPLAIN support in DataFrame API for debugging purpose + Cheng Hao + 2015-02-10 19:40:51 -0800 + Commit: 45df77b, github.com/apache/spark/pull/4496 + + [SPARK-5704] [SQL] [PySpark] createDataFrame from RDD with columns + Davies Liu + 2015-02-10 19:40:12 -0800 + Commit: ea60284, github.com/apache/spark/pull/4498 + + [SPARK-5683] [SQL] Avoid multiple json generator created + Cheng Hao + 2015-02-10 18:19:56 -0800 + Commit: a60aea8, github.com/apache/spark/pull/4468 + + [SQL] Add an exception for analysis errors. 
+ Michael Armbrust + 2015-02-10 17:32:42 -0800 + Commit: 6195e24, github.com/apache/spark/pull/4439 + + [SPARK-5658][SQL] Finalize DDL and write support APIs + Yin Huai + 2015-02-10 17:29:52 -0800 + Commit: aaf50d0, github.com/apache/spark/pull/4446 + + [SPARK-5493] [core] Add option to impersonate user. + Marcelo Vanzin + 2015-02-10 17:19:10 -0800 + Commit: ed167e7, github.com/apache/spark/pull/4405 + + [SQL] Make Options in the data source API CREATE TABLE statements optional. + Yin Huai + 2015-02-10 17:06:12 -0800 + Commit: e28b6bd, github.com/apache/spark/pull/4515 + + [SPARK-5725] [SQL] Fixes ParquetRelation2.equals + Cheng Lian + 2015-02-10 17:02:44 -0800 + Commit: 2d50a01, github.com/apache/spark/pull/4513 + + [SQL][Minor] correct some comments + Sheng, Li , OopsOutOfMemory + 2015-02-11 00:59:46 +0000 + Commit: 91e3512, github.com/apache/spark/pull/4508 + + [SPARK-5644] [Core]Delete tmp dir when sc is stop + Sephiroth-Lin + 2015-02-10 23:23:35 +0000 + Commit: 52983d7, github.com/apache/spark/pull/4412 + + [SPARK-5343][GraphX]: ShortestPaths traverses backwards + Brennon York + 2015-02-10 14:57:00 -0800 + Commit: 5820961, github.com/apache/spark/pull/4478 + + [SPARK-5021] [MLlib] Gaussian Mixture now supports Sparse Input + MechCoder + 2015-02-10 14:05:55 -0800 + Commit: fd2c032, github.com/apache/spark/pull/4459 + + [SPARK-5686][SQL] Add show current roles command in HiveQl + OopsOutOfMemory + 2015-02-10 13:20:15 -0800 + Commit: f98707c, github.com/apache/spark/pull/4471 + + [SQL] Add toString to DataFrame/Column + Michael Armbrust + 2015-02-10 13:14:01 -0800 + Commit: de80b1b, github.com/apache/spark/pull/4436 + + [SPARK-5668] Display region in spark_ec2.py get_existing_cluster() + Miguel Peralvo + 2015-02-10 19:54:52 +0000 + Commit: c49a404, github.com/apache/spark/pull/4457 + + [SPARK-5592][SQL] java.net.URISyntaxException when insert data to a partitioned table + wangfei , Fei Wang + 2015-02-10 11:54:30 -0800 + Commit: 59272da, github.com/apache/spark/pull/4368 + + [HOTFIX][SPARK-4136] Fix compilation and tests + Andrew Or + 2015-02-10 11:18:01 -0800 + Commit: b640c84 + + SPARK-4136. Under dynamic allocation, cancel outstanding executor requests when no longer needed + Sandy Ryza + 2015-02-10 11:07:25 -0800 + Commit: 69bc3bb, github.com/apache/spark/pull/4168 + + [SPARK-5716] [SQL] Support TOK_CHARSETLITERAL in HiveQl + Daoyuan Wang + 2015-02-10 11:08:21 -0800 + Commit: c7ad80a, github.com/apache/spark/pull/4502 + + [Spark-5717] [MLlib] add stop and reorganize import + JqueryFan , Yuhao Yang + 2015-02-10 17:37:32 +0000 + Commit: 6cc96cf, github.com/apache/spark/pull/4503 + + [SPARK-1805] [EC2] Validate instance types + Nicholas Chammas + 2015-02-10 15:45:38 +0000 + Commit: 50820f1, github.com/apache/spark/pull/4455 + + [SPARK-5700] [SQL] [Build] Bumps jets3t to 0.9.3 for hadoop-2.3 and hadoop-2.4 profiles + Cheng Lian + 2015-02-10 02:28:47 -0800 + Commit: ba66793, github.com/apache/spark/pull/4499 + + SPARK-5239 [CORE] JdbcRDD throws "java.lang.AbstractMethodError: oracle.jdbc.driver.xxxxxx.isClosed()Z" + Sean Owen + 2015-02-10 09:19:01 +0000 + Commit: 2d1e916, github.com/apache/spark/pull/4470 + + [SPARK-4964][Streaming][Kafka] More updates to Exactly-once Kafka stream + Tathagata Das + 2015-02-09 22:45:48 -0800 + Commit: c151346, github.com/apache/spark/pull/4384 + + [SPARK-5597][MLLIB] save/load for decision trees and emsembles + Joseph K. Bradley , Xiangrui Meng + 2015-02-09 22:09:07 -0800 + Commit: ef2f55b, github.com/apache/spark/pull/4444. + + [SQL] Remove the duplicated code + Cheng Hao + 2015-02-09 21:33:34 -0800 + Commit: bd0b5ea, github.com/apache/spark/pull/4494 + + [SPARK-5701] Only set ShuffleReadMetrics when task has shuffle deps + Kay Ousterhout + 2015-02-09 21:22:09 -0800 + Commit: a2d33d0, github.com/apache/spark/pull/4488 + + [SPARK-5703] AllJobsPage throws empty.max exception + Andrew Or + 2015-02-09 21:18:48 -0800 + Commit: a95ed52, github.com/apache/spark/pull/4490 + + [SPARK-2996] Implement userClassPathFirst for driver, yarn. + Marcelo Vanzin + 2015-02-09 21:17:06 -0800 + Commit: 20a6013, github.com/apache/spark/pull/3233 + + SPARK-4900 [MLLIB] MLlib SingularValueDecomposition ARPACK IllegalStateException + Sean Owen + 2015-02-09 21:13:58 -0800 + Commit: 36c4e1d, github.com/apache/spark/pull/4485 + + Add a config option to print DAG. + KaiXinXiaoLei + 2015-02-09 20:58:58 -0800 + Commit: 31d435e, github.com/apache/spark/pull/4257 + + [SPARK-5469] restructure pyspark.sql into multiple files + Davies Liu + 2015-02-09 20:49:22 -0800 + Commit: 08488c1, github.com/apache/spark/pull/4479 + + [SPARK-5698] Do not let user request negative # of executors + Andrew Or + 2015-02-09 17:33:29 -0800 + Commit: d302c48, github.com/apache/spark/pull/4483 + + [SPARK-5699] [SQL] [Tests] Runs hive-thriftserver tests whenever SQL code is modified + Cheng Lian + 2015-02-09 16:52:05 -0800 + Commit: 3ec3ad2, github.com/apache/spark/pull/4486 + + [SPARK-5648][SQL] support "alter ... unset tblproperties("key")" + DoingDone9 <799203320@qq.com> + 2015-02-09 16:40:26 -0800 + Commit: d08e7c2, github.com/apache/spark/pull/4424 + + [SPARK-2096][SQL] support dot notation on array of struct + Wenchen Fan + 2015-02-09 16:39:34 -0800 + Commit: 0ee53eb, github.com/apache/spark/pull/2405 + + [SPARK-5614][SQL] Predicate pushdown through Generate. 
+ Lu Yan + 2015-02-09 16:25:38 -0800 + Commit: 2a36292, github.com/apache/spark/pull/4394 + + [SPARK-5696] [SQL] [HOTFIX] Asks HiveThriftServer2 to re-initialize log4j using Hive configurations + Cheng Lian + 2015-02-09 16:23:12 -0800 + Commit: b8080aa, github.com/apache/spark/pull/4484 + + [SQL] Code cleanup. + Yin Huai + 2015-02-09 16:20:42 -0800 + Commit: 5f0b30e, github.com/apache/spark/pull/4482 + + [SQL] Add some missing DataFrame functions. + Michael Armbrust + 2015-02-09 16:02:56 -0800 + Commit: 68b25cf, github.com/apache/spark/pull/4437 + + [SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py + Florian Verhein + 2015-02-09 23:47:07 +0000 + Commit: b884daa, github.com/apache/spark/pull/4385 + + [SPARK-5675][SQL] XyzType companion object should subclass XyzType + Reynold Xin + 2015-02-09 14:51:46 -0800 + Commit: f48199e, github.com/apache/spark/pull/4463 + + [SPARK-4905][STREAMING] FlumeStreamSuite fix. + Hari Shreedharan + 2015-02-09 14:17:14 -0800 + Commit: 0765af9, github.com/apache/spark/pull/4371 + + [SPARK-5691] Fixing wrong data structure lookup for dupe app registratio... + mcheah + 2015-02-09 13:20:14 -0800 + Commit: 6fe70d8, github.com/apache/spark/pull/4477 + + [SPARK-5664][BUILD] Restore stty settings when exiting from SBT's spark-shell + Liang-Chi Hsieh + 2015-02-09 11:45:12 -0800 + Commit: dae2161, github.com/apache/spark/pull/4451 + + [SPARK-5678] Convert DataFrame to pandas.DataFrame and Series + Davies Liu + 2015-02-09 11:42:52 -0800 + Commit: afb1316, github.com/apache/spark/pull/4476 + + SPARK-4267 [YARN] Failing to launch jobs on Spark on YARN with Hadoop 2.5.0 or later + Sean Owen + 2015-02-09 10:33:57 -0800 + Commit: de78060, github.com/apache/spark/pull/4452 + + SPARK-2149. [MLLIB] Univariate kernel density estimation + Sandy Ryza + 2015-02-09 10:12:12 +0000 + Commit: 0793ee1, github.com/apache/spark/pull/1093 + + [SPARK-5473] [EC2] Expose SSH failures after status checks pass + Nicholas Chammas + 2015-02-09 09:44:53 +0000 + Commit: 4dfe180, github.com/apache/spark/pull/4262 + + [SPARK-5539][MLLIB] LDA guide + Xiangrui Meng , Joseph K. Bradley + 2015-02-08 23:40:36 -0800 + Commit: 855d12a, github.com/apache/spark/pull/4465 + + [SPARK-5472][SQL] Fix Scala code style + Hung Lin + 2015-02-08 22:36:42 -0800 + Commit: 4575c56, github.com/apache/spark/pull/4464 + + SPARK-4405 [MLLIB] Matrices.* construction methods should check for rows x cols overflow + Sean Owen + 2015-02-08 21:08:50 -0800 + Commit: 4396dfb, github.com/apache/spark/pull/4461 + + [SPARK-5660][MLLIB] Make Matrix apply public + Joseph K. Bradley , Xiangrui Meng + 2015-02-08 21:07:36 -0800 + Commit: c171611, github.com/apache/spark/pull/4447 + + [SPARK-5643][SQL] Add a show method to print the content of a DataFrame in tabular format. + Reynold Xin + 2015-02-08 18:56:51 -0800 + Commit: a052ed4, github.com/apache/spark/pull/4416 + + SPARK-5665 [DOCS] Update netlib-java documentation + Sam Halliday , Sam Halliday + 2015-02-08 16:34:26 -0800 + Commit: 56aff4b, github.com/apache/spark/pull/4448 + + [SPARK-5598][MLLIB] model save/load for ALS + Xiangrui Meng + 2015-02-08 16:26:20 -0800 + Commit: 5c299c5, github.com/apache/spark/pull/4422 + + [SQL] Set sessionState in QueryExecution. 
+ Yin Huai + 2015-02-08 14:55:07 -0800 + Commit: 804949d, github.com/apache/spark/pull/4445 + + [SPARK-3039] [BUILD] Spark assembly for new hadoop API (hadoop 2) contai... + medale + 2015-02-08 10:35:29 +0000 + Commit: 75fdccc, github.com/apache/spark/pull/4315 + + [SPARK-5672][Web UI] Don't return `ERROR 500` when have missing args + Kirill A. Korinskiy + 2015-02-08 10:31:46 +0000 + Commit: 23a99da, github.com/apache/spark/pull/4239 + + [SPARK-5656] Fail gracefully for large values of k and/or n that will ex... + mbittmann , bittmannm + 2015-02-08 10:13:29 +0000 + Commit: 4878313, github.com/apache/spark/pull/4433 + + [SPARK-5366][EC2] Check the mode of private key + liuchang0812 + 2015-02-08 10:08:51 +0000 + Commit: 6fb141e, github.com/apache/spark/pull/4162 + + [SPARK-5671] Upgrade jets3t to 0.9.2 in hadoop-2.3 and 2.4 profiles + Josh Rosen + 2015-02-07 17:19:08 -0800 + Commit: 5de14cc, github.com/apache/spark/pull/4454 + + [SPARK-5108][BUILD] Jackson dependency management for Hadoop-2.6.0 support + Zhan Zhang + 2015-02-07 19:41:30 +0000 + Commit: ecbbed2, github.com/apache/spark/pull/3938 + + SPARK-5408: Use -XX:MaxPermSize specified by user instead of default in ... + Jacek Lewandowski + 2015-02-07 15:58:04 +0000 + Commit: dd4cb33, github.com/apache/spark/pull/4203 + + [BUILD] Add the ability to launch spark-shell from SBT. + Michael Armbrust + 2015-02-07 00:14:38 -0800 + Commit: e9a4fe1, github.com/apache/spark/pull/4438 + + [SPARK-5388] Provide a stable application submission gateway for standalone cluster mode + Andrew Or + 2015-02-06 15:57:06 -0800 + Commit: 1390e56, github.com/apache/spark/pull/4216 + + SPARK-5403: Ignore UserKnownHostsFile in SSH calls + Grzegorz Dubicki + 2015-02-06 15:43:58 -0800 + Commit: e772b4e, github.com/apache/spark/pull/4196 + + [SPARK-5601][MLLIB] make streaming linear algorithms Java-friendly + Xiangrui Meng + 2015-02-06 15:42:59 -0800 + Commit: 0e23ca9, github.com/apache/spark/pull/4432 + + [SQL] [Minor] HiveParquetSuite was disabled by mistake, re-enable them + Cheng Lian + 2015-02-06 15:23:42 -0800 + Commit: c402140, github.com/apache/spark/pull/4440 + + [SQL] Use TestSQLContext in Java tests + Michael Armbrust + 2015-02-06 15:11:02 -0800 + Commit: 76c4bf5, github.com/apache/spark/pull/4441 + + [SPARK-4994][network]Cleanup removed executors' ShuffleInfo in yarn shuffle service + lianhuiwang + 2015-02-06 14:47:52 -0800 + Commit: 61073f8, github.com/apache/spark/pull/3828 + + [SPARK-5444][Network]Add a retry to deal with the conflict port in netty server. + huangzhaowei + 2015-02-06 14:35:29 -0800 + Commit: 2bda1c1, github.com/apache/spark/pull/4240 + + [SPARK-4874] [CORE] Collect record count metrics + Kostas Sakellis + 2015-02-06 14:31:20 -0800 + Commit: dcd1e42, github.com/apache/spark/pull/4067 + + [HOTFIX] Fix the maven build after adding sqlContext to spark-shell + Michael Armbrust + 2015-02-06 14:27:06 -0800 + Commit: 5796156, github.com/apache/spark/pull/4443 + + [SPARK-5600] [core] Clean up FsHistoryProvider test, fix app sort order. + Marcelo Vanzin + 2015-02-06 14:23:09 -0800 + Commit: 5687bab, github.com/apache/spark/pull/4370 + + SPARK-5613: Catch the ApplicationNotFoundException exception to avoid thread from getting killed on yarn restart. 
+ Kashish Jain + 2015-02-06 13:47:23 -0800 + Commit: ca66159, github.com/apache/spark/pull/4392 + + SPARK-5633 pyspark saveAsTextFile support for compression codec + Vladimir Vladimirov + 2015-02-06 13:55:02 -0800 + Commit: b3872e0, github.com/apache/spark/pull/4403 + + [HOTFIX][MLLIB] fix a compilation error with java 6 + Xiangrui Meng + 2015-02-06 13:52:35 -0800 + Commit: 65181b7, github.com/apache/spark/pull/4442 + + [SPARK-4983] Insert waiting time before tagging EC2 instances + GenTang , Gen TANG + 2015-02-06 13:27:34 -0800 + Commit: 0f3a360, github.com/apache/spark/pull/3986 + + [SPARK-5586][Spark Shell][SQL] Make `sqlContext` available in spark shell + OopsOutOfMemory + 2015-02-06 13:20:10 -0800 + Commit: 3d3ecd7, github.com/apache/spark/pull/4387 + + [SPARK-5278][SQL] Introduce UnresolvedGetField and complete the check of ambiguous reference to fields + Wenchen Fan + 2015-02-06 13:08:09 -0800 + Commit: 4793c84, github.com/apache/spark/pull/4068 + + [SQL][Minor] Remove cache keyword in SqlParser + wangfei + 2015-02-06 12:42:23 -0800 + Commit: bc36356, github.com/apache/spark/pull/4393 + + [SQL][HiveConsole][DOC] HiveConsole `correct hiveconsole imports` + OopsOutOfMemory + 2015-02-06 12:41:28 -0800 + Commit: b62c352, github.com/apache/spark/pull/4389 + + [SPARK-5595][SPARK-5603][SQL] Add a rule to do PreInsert type casting and field renaming and invalidating in memory cache after INSERT + Yin Huai + 2015-02-06 12:38:07 -0800 + Commit: 3eccf29, github.com/apache/spark/pull/4373 + + [SPARK-5324][SQL] Results of describe can't be queried + OopsOutOfMemory , Sheng, Li + 2015-02-06 12:33:20 -0800 + Commit: 0b7eb3f, github.com/apache/spark/pull/4249 + + [SPARK-5619][SQL] Support 'show roles' in HiveContext + q00251598 + 2015-02-06 12:29:26 -0800 + Commit: a958d60, github.com/apache/spark/pull/4397 + + [SPARK-5640] Synchronize ScalaReflection where necessary + Tobias Schlatter + 2015-02-06 12:15:02 -0800 + Commit: 500dc2b, github.com/apache/spark/pull/4431 + + [SPARK-5650][SQL] Support optional 'FROM' clause + Liang-Chi Hsieh + 2015-02-06 12:13:44 -0800 + Commit: d433816, github.com/apache/spark/pull/4426 + + [SPARK-5628] Add version option to spark-ec2 + Nicholas Chammas + 2015-02-06 12:08:22 -0800 + Commit: 70e5b03, github.com/apache/spark/pull/4414 + + [SPARK-2945][YARN][Doc]add doc for spark.executor.instances + WangTaoTheTonic + 2015-02-06 11:57:02 -0800 + Commit: d34f79c, github.com/apache/spark/pull/4350 + + [SPARK-4361][Doc] Add more docs for Hadoop Configuration + zsxwing + 2015-02-06 11:50:20 -0800 + Commit: af2a2a2, github.com/apache/spark/pull/3225 + + [HOTFIX] Fix test build break in ExecutorAllocationManagerSuite. + Josh Rosen + 2015-02-06 11:47:32 -0800 + Commit: fb6c0cb + + [SPARK-5652][Mllib] Use broadcasted weights in LogisticRegressionModel + Liang-Chi Hsieh + 2015-02-06 11:22:11 -0800 + Commit: 80f3bcb, github.com/apache/spark/pull/4429 + + [SPARK-5555] Enable UISeleniumSuite tests + Josh Rosen + 2015-02-06 11:14:58 -0800 + Commit: 0d74bd7, github.com/apache/spark/pull/4334 + + SPARK-2450 Adds executor log links to Web UI + Kostas Sakellis , Josh Rosen + 2015-02-06 11:13:00 -0800 + Commit: 32e964c, github.com/apache/spark/pull/3486 + + [SPARK-5618][Spark Core][Minor] Optimise utility code. 
+ Makoto Fukuhara + 2015-02-06 11:11:38 -0800 + Commit: 4cdb26c, github.com/apache/spark/pull/4396 + + [SPARK-5593][Core]Replace BlockManagerListener with ExecutorListener in ExecutorAllocationListener + lianhuiwang + 2015-02-06 11:09:37 -0800 + Commit: 6072fcc, github.com/apache/spark/pull/4369 + + [SPARK-4877] Allow user first classes to extend classes in the parent. + Stephen Haberman + 2015-02-06 11:03:56 -0800 + Commit: 9792bec, github.com/apache/spark/pull/3725 + + [SPARK-5396] Syntax error in spark scripts on windows. + Masayoshi TSUZUKI + 2015-02-06 10:58:26 -0800 + Commit: c01b985, github.com/apache/spark/pull/4428 + + [SPARK-5636] Ramp up faster in dynamic allocation + Andrew Or + 2015-02-06 10:54:23 -0800 + Commit: fe3740c, github.com/apache/spark/pull/4409 + + SPARK-4337. [YARN] Add ability to cancel pending requests + Sandy Ryza + 2015-02-06 10:53:16 -0800 + Commit: 1a88f20, github.com/apache/spark/pull/4141 + + [SPARK-5653][YARN] In ApplicationMaster rename isDriver to isClusterMode + lianhuiwang + 2015-02-06 10:48:31 -0800 + Commit: cc6e531, github.com/apache/spark/pull/4430 + + [SPARK-5013] [MLlib] Added documentation and sample data file for GaussianMixture + Travis Galoppo + 2015-02-06 10:26:51 -0800 + Commit: 9ad56ad, github.com/apache/spark/pull/4401 + + [SPARK-5416] init Executor.threadPool before ExecutorSource + Ryan Williams + 2015-02-06 12:22:25 +0000 + Commit: 37d35ab, github.com/apache/spark/pull/4212 + + [Build] Set all Debian package permissions to 755 + Nicholas Chammas + 2015-02-06 11:38:39 +0000 + Commit: cf6778e, github.com/apache/spark/pull/4277 + + Update ec2-scripts.md + Miguel Peralvo + 2015-02-06 11:04:48 +0000 + Commit: f827ef4, github.com/apache/spark/pull/4300 + + [SPARK-5470][Core]use defaultClassLoader to load classes in KryoSerializer + lianhuiwang + 2015-02-06 11:00:35 +0000 + Commit: ed3aac7, github.com/apache/spark/pull/4258 + + [SPARK-5582] [history] Ignore empty log directories. + Marcelo Vanzin + 2015-02-06 10:07:20 +0000 + Commit: 8569289, github.com/apache/spark/pull/4352 + + [SPARK-5157][YARN] Configure more JVM options properly when we use ConcMarkSweepGC for AM. + Kousuke Saruta + 2015-02-06 09:39:12 +0000 + Commit: 24dbc50, github.com/apache/spark/pull/3956 + + [Minor] Remove permission for execution from spark-shell.cmd + Kousuke Saruta + 2015-02-06 09:33:36 +0000 + Commit: f6ba813, github.com/apache/spark/pull/3983 + + [SPARK-5380][GraphX] Solve an ArrayIndexOutOfBoundsException when build graph with a file format error + Leolh + 2015-02-06 09:01:53 +0000 + Commit: 575d2df, github.com/apache/spark/pull/4176 + + [SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs + Joseph K. Bradley + 2015-02-05 23:43:47 -0800 + Commit: dc0c449, github.com/apache/spark/pull/3637 + + [SPARK-5604][MLLIB] remove checkpointDir from trees + Xiangrui Meng + 2015-02-05 23:32:09 -0800 + Commit: 6b88825, github.com/apache/spark/pull/4407 + + [SPARK-5639][SQL] Support DataFrame.renameColumn. + Reynold Xin + 2015-02-05 23:02:40 -0800 + Commit: 7dc4965, github.com/apache/spark/pull/4410 + + Revert "SPARK-5607: Update to Kryo 2.24.0 to avoid including objenesis 1.2." 
+ Patrick Wendell + 2015-02-05 18:36:48 -0800 + Commit: 6d3b7cb + + SPARK-5557: Explicitly include servlet API in dependencies. + Patrick Wendell + 2015-02-05 18:14:54 -0800 + Commit: 793dbae, github.com/apache/spark/pull/4411 + + [HOTFIX] [SQL] Disables Metastore Parquet table conversion for "SQLQuerySuite.CTAS with serde" + Cheng Lian + 2015-02-05 18:09:18 -0800 + Commit: 7c0a648, github.com/apache/spark/pull/4413 + + [SPARK-5638][SQL] Add a config flag to disable eager analysis of DataFrames + Reynold Xin + 2015-02-05 18:07:10 -0800 + Commit: e8a5d50, github.com/apache/spark/pull/4408 + + [SPARK-5620][DOC] group methods in generated unidoc + Xiangrui Meng + 2015-02-05 16:26:51 -0800 + Commit: 85ccee8, github.com/apache/spark/pull/4404 + + [SPARK-5182] [SPARK-5528] [SPARK-5509] [SPARK-3575] [SQL] Parquet data source improvements + Cheng Lian + 2015-02-05 15:29:56 -0800 + Commit: a9ed511, github.com/apache/spark/pull/4308 + + [SPARK-5604[MLLIB] remove checkpointDir from LDA + Xiangrui Meng + 2015-02-05 15:07:33 -0800 + Commit: c19152c, github.com/apache/spark/pull/4390 + + [SPARK-5460][MLlib] Wrapped `Try` around `deleteAllCheckpoints` - RandomForest. + x1- + 2015-02-05 15:02:04 -0800 + Commit: 62371ad, github.com/apache/spark/pull/4347 + + [SPARK-5135][SQL] Add support for describe table to DDL in SQLContext + OopsOutOfMemory + 2015-02-05 13:07:48 -0800 + Commit: 4d8d070, github.com/apache/spark/pull/4227 + + [SPARK-5617][SQL] fix test failure of SQLQuerySuite + wangfei + 2015-02-05 12:44:12 -0800 + Commit: a83936e, github.com/apache/spark/pull/4395 + + [Branch-1.3] [DOC] doc fix for date + Daoyuan Wang + 2015-02-05 12:42:27 -0800 + Commit: 6fa4ac1, github.com/apache/spark/pull/4400 + + SPARK-5548: Fixed a race condition in AkkaUtilsSuite + Jacek Lewandowski + 2015-02-05 12:00:04 -0800 + Commit: 081ac69, github.com/apache/spark/pull/4343 + + [SPARK-5474][Build]curl should support URL redirection in build/mvn + GuoQiang Li + 2015-02-05 12:03:13 -0800 + Commit: 3414754, github.com/apache/spark/pull/4263 + + [SPARK-5608] Improve SEO of Spark documentation pages + Matei Zaharia + 2015-02-05 11:12:50 -0800 + Commit: 4d74f06, github.com/apache/spark/pull/4381 + + SPARK-4687. Add a recursive option to the addFile API + Sandy Ryza + 2015-02-05 10:15:55 -0800 + Commit: c4b1108, github.com/apache/spark/pull/3670 + + [HOTFIX] MLlib build break. + Reynold Xin + 2015-02-05 00:42:50 -0800 + Commit: 6580929 + + [MLlib] Minor: UDF style update. + Reynold Xin + 2015-02-04 23:57:53 -0800 + Commit: c3ba4d4, github.com/apache/spark/pull/4388 + + [SPARK-5612][SQL] Move DataFrame implicit functions into SQLContext.implicits. + Reynold Xin + 2015-02-04 23:44:34 -0800 + Commit: 7d789e1, github.com/apache/spark/pull/4386 + + [SPARK-5606][SQL] Support plus sign in HiveContext + q00251598 + 2015-02-04 23:16:01 -0800 + Commit: 9d3a75e, github.com/apache/spark/pull/4378 + + [SPARK-5599] Check MLlib public APIs for 1.3 + Xiangrui Meng + 2015-02-04 23:03:47 -0800 + Commit: db34690, github.com/apache/spark/pull/4377 + + [SPARK-5596] [mllib] ML model import/export for GLMs, NaiveBayes + Joseph K. Bradley + 2015-02-04 22:46:48 -0800 + Commit: 975bcef, github.com/apache/spark/pull/4233 + + SPARK-5607: Update to Kryo 2.24.0 to avoid including objenesis 1.2. 
+ Patrick Wendell + 2015-02-04 22:39:44 -0800 + Commit: c23ac03, github.com/apache/spark/pull/4383 + + [SPARK-5602][SQL] Better support for creating DataFrame from local data collection + Reynold Xin + 2015-02-04 19:53:57 -0800 + Commit: 84acd08, github.com/apache/spark/pull/4372 + + [SPARK-5538][SQL] Fix flaky CachedTableSuite + Reynold Xin + 2015-02-04 19:52:41 -0800 + Commit: 206f9bc, github.com/apache/spark/pull/4379 + + [SQL][DataFrame] Minor cleanup. + Reynold Xin + 2015-02-04 19:51:48 -0800 + Commit: 6b4c7f0, github.com/apache/spark/pull/4374 + + [SPARK-4520] [SQL] This pr fixes the ArrayIndexOutOfBoundsException as r... + Sadhan Sood + 2015-02-04 19:18:06 -0800 + Commit: dba98bf, github.com/apache/spark/pull/4148 + + [SPARK-5605][SQL][DF] Allow using String to specify colum name in DSL aggregate functions + Reynold Xin + 2015-02-04 18:35:51 -0800 + Commit: 1fbd124, github.com/apache/spark/pull/4376 + + [SPARK-5411] Allow SparkListeners to be specified in SparkConf and loaded when creating SparkContext + Josh Rosen + 2015-02-04 17:18:03 -0800 + Commit: 9a7ce70, github.com/apache/spark/pull/4111 + + [SPARK-5577] Python udf for DataFrame + Davies Liu + 2015-02-04 15:55:09 -0800 + Commit: dc101b0, github.com/apache/spark/pull/4351 + + [SPARK-5118][SQL] Fix: create table test stored as parquet as select .. + guowei2 + 2015-02-04 15:26:10 -0800 + Commit: e0490e2, github.com/apache/spark/pull/3921 + + [SQL] Use HiveContext's sessionState in HiveMetastoreCatalog.hiveDefaultTableFilePath + Yin Huai + 2015-02-04 15:22:40 -0800 + Commit: 548c9c2, github.com/apache/spark/pull/4355 + + [SQL] Correct the default size of TimestampType and expose NumericType + Yin Huai + 2015-02-04 15:14:49 -0800 + Commit: 0d81645, github.com/apache/spark/pull/4314 + + [SQL][Hiveconsole] Bring hive console code up to date and update README.md + OopsOutOfMemory , Sheng, Li + 2015-02-04 15:13:54 -0800 + Commit: b73d5ff, github.com/apache/spark/pull/4330 + + [SPARK-5367][SQL] Support star expression in udfs + wangfei , scwf + 2015-02-04 15:12:07 -0800 + Commit: 417d111, github.com/apache/spark/pull/4353 + + [SPARK-5426][SQL] Add SparkSQL Java API helper methods. + kul + 2015-02-04 15:08:37 -0800 + Commit: 424cb69, github.com/apache/spark/pull/4243 + + [SPARK-5587][SQL] Support change database owner + wangfei + 2015-02-04 14:35:12 -0800 + Commit: b90dd39, github.com/apache/spark/pull/4357 + + [SPARK-5591][SQL] Fix NoSuchObjectException for CTAS + wangfei + 2015-02-04 14:33:07 -0800 + Commit: a9f0db1, github.com/apache/spark/pull/4365 + + [SPARK-4939] move to next locality when no pending tasks + Davies Liu + 2015-02-04 14:22:07 -0800 + Commit: 0a89b15, github.com/apache/spark/pull/3779 + + [SPARK-4707][STREAMING] Reliable Kafka Receiver can lose data if the blo... 
+ Hari Shreedharan + 2015-02-04 14:20:44 -0800 + Commit: f0500f9, github.com/apache/spark/pull/3655 + + [SPARK-4964] [Streaming] Exactly-once semantics for Kafka + cody koeninger + 2015-02-04 12:06:34 -0800 + Commit: b0c0021, github.com/apache/spark/pull/3798 + + [SPARK-5588] [SQL] support select/filter by SQL expression + Davies Liu + 2015-02-04 11:34:46 -0800 + Commit: ac0b2b7, github.com/apache/spark/pull/4359 + + [SPARK-5585] Flaky test in MLlib python + Davies Liu + 2015-02-04 08:54:20 -0800 + Commit: 38a416f, github.com/apache/spark/pull/4358 + + [SPARK-5574] use given name prefix in dir + Imran Rashid + 2015-02-04 01:02:20 -0800 + Commit: 5aa0f21, github.com/apache/spark/pull/4344 + + [Minor] Fix incorrect warning log + Liang-Chi Hsieh + 2015-02-04 00:52:41 -0800 + Commit: a74cbbf, github.com/apache/spark/pull/4360 + + [SPARK-5379][Streaming] Add awaitTerminationOrTimeout + zsxwing + 2015-02-04 00:40:28 -0800 + Commit: 4cf4cba, github.com/apache/spark/pull/4171 + + [SPARK-5341] Use maven coordinates as dependencies in spark-shell and spark-submit + Burak Yavuz + 2015-02-03 22:39:17 -0800 + Commit: 6aed719, github.com/apache/spark/pull/4215 + + [SPARK-4939] revive offers periodically in LocalBackend + Davies Liu + 2015-02-03 22:30:23 -0800 + Commit: 83de71c, github.com/apache/spark/pull/4147 + + [SPARK-4969][STREAMING][PYTHON] Add binaryRecords to streaming + freeman + 2015-02-03 22:24:30 -0800 + Commit: 242b4f0, github.com/apache/spark/pull/3803 + + [SPARK-5579][SQL][DataFrame] Support for project/filter using SQL expressions + Reynold Xin + 2015-02-03 22:15:35 -0800 + Commit: 40c4cb2, github.com/apache/spark/pull/4348 + + [FIX][MLLIB] fix seed handling in Python GMM + Xiangrui Meng + 2015-02-03 20:39:11 -0800 + Commit: eb15631, github.com/apache/spark/pull/4349 + + [SPARK-4795][Core] Redesign the "primitive type => Writable" implicit APIs to make them be activated automatically + zsxwing + 2015-02-03 20:17:12 -0800 + Commit: d37978d, github.com/apache/spark/pull/3642 + + [SPARK-5578][SQL][DataFrame] Provide a convenient way for Scala users to use UDFs + Reynold Xin + 2015-02-03 20:07:46 -0800 + Commit: 1077f2e, github.com/apache/spark/pull/4345 + + [SPARK-5520][MLlib] Make FP-Growth implementation take generic item types (WIP) + Jacky Li , Jacky Li , Xiangrui Meng + 2015-02-03 17:02:42 -0800 + Commit: e380d2d, github.com/apache/spark/pull/4340 + + [SPARK-5554] [SQL] [PySpark] add more tests for DataFrame Python API + Davies Liu + 2015-02-03 16:01:56 -0800 + Commit: 068c0e2, github.com/apache/spark/pull/4331 + + [STREAMING] SPARK-4986 Wait for receivers to deregister and receiver job to terminate + Jesper Lundgren + 2015-02-03 14:53:39 -0800 + Commit: 1e8b539, github.com/apache/spark/pull/4338 + + [SPARK-5153][Streaming][Test] Increased timeout to deal with flaky KafkaStreamSuite + Tathagata Das + 2015-02-03 13:46:02 -0800 + Commit: 681f9df, github.com/apache/spark/pull/4342 + + [SPARK-4508] [SQL] build native date type to conform behavior to Hive + Daoyuan Wang + 2015-02-03 12:21:45 -0800 + Commit: db821ed, github.com/apache/spark/pull/4325 + + [SPARK-5383][SQL] Support alias for udtfs + wangfei , scwf , Fei Wang + 2015-02-03 12:16:31 -0800 + Commit: 5adbb39, github.com/apache/spark/pull/4186 + + 
[SPARK-5550] [SQL] Support the case insensitive for UDF + Cheng Hao + 2015-02-03 12:12:26 -0800 + Commit: ca7a6cd, github.com/apache/spark/pull/4326 + + [SPARK-4987] [SQL] parquet timestamp type support + Daoyuan Wang + 2015-02-03 12:06:06 -0800 + Commit: 0c20ce6, github.com/apache/spark/pull/3820 + + +Release 1.3.1 + + [SQL] Use path.makeQualified in newParquet. + Yin Huai + 2015-04-04 23:26:10 +0800 + Commit: eb57d4f, github.com/apache/spark/pull/5353 + + [SPARK-6700] disable flaky test + Davies Liu + 2015-04-03 15:22:21 -0700 + Commit: 3366af6, github.com/apache/spark/pull/5356 + + [SPARK-6688] [core] Always use resolved URIs in EventLoggingListener. + Marcelo Vanzin + 2015-04-03 11:54:31 -0700 + Commit: f17a2fe, github.com/apache/spark/pull/5340 + + [SPARK-6575][SQL] Converted Parquet Metastore tables no longer cache metadata + Yin Huai + 2015-04-03 14:40:36 +0800 + Commit: 0c1b78b, github.com/apache/spark/pull/5339 + + [SPARK-6621][Core] Fix the bug that calling EventLoop.stop in EventLoop.onReceive/onError/onStart doesn't call onStop + zsxwing + 2015-04-02 22:54:30 -0700 + Commit: ac705aa, github.com/apache/spark/pull/5280 + + [SPARK-6345][STREAMING][MLLIB] Fix for training with prediction + freeman + 2015-04-02 21:37:44 -0700 + Commit: d21f779, github.com/apache/spark/pull/5037 + + [CORE] The descriptionof jobHistory config should be spark.history.fs.logDirectory + KaiXinXiaoLei + 2015-04-02 20:24:31 -0700 + Commit: 17ab6b0, github.com/apache/spark/pull/5332 + + [SPARK-6575][SQL] Converted Parquet Metastore tables no longer cache metadata + Yin Huai + 2015-04-02 20:23:08 -0700 + Commit: 0c1c0fb, github.com/apache/spark/pull/5339 + + [SPARK-6650] [core] Stop ExecutorAllocationManager when context stops. + Marcelo Vanzin + 2015-04-02 19:48:55 -0700 + Commit: 0ef46b2, github.com/apache/spark/pull/5311 + + [SPARK-6686][SQL] Use resolved output instead of names for toDF rename + Michael Armbrust + 2015-04-02 18:30:55 -0700 + Commit: 2927af1, github.com/apache/spark/pull/5337 + + [SPARK-6672][SQL] convert row to catalyst in createDataFrame(RDD[Row], ...) + Xiangrui Meng + 2015-04-02 17:57:01 +0800 + Commit: c2694bb, github.com/apache/spark/pull/5329 + + [SPARK-6618][SPARK-6669][SQL] Lock Hive metastore client correctly. 
+ Yin Huai , Michael Armbrust + 2015-04-02 16:46:50 -0700 + Commit: e6ee95c, github.com/apache/spark/pull/5333 + + [Minor] [SQL] Follow-up of PR #5210 + Cheng Lian + 2015-04-02 16:15:34 -0700 + Commit: 4f1fe3f, github.com/apache/spark/pull/5219 + + [SPARK-6655][SQL] We need to read the schema of a data source table stored in spark.sql.sources.schema property + Yin Huai + 2015-04-02 16:02:31 -0700 + Commit: aecec07, github.com/apache/spark/pull/5313 + + [SQL] Throw UnsupportedOperationException instead of NotImplementedError + Michael Armbrust + 2015-04-02 16:01:03 -0700 + Commit: 78ba245, github.com/apache/spark/pull/5315 + + SPARK-6414: Spark driver failed with NPE on job cancelation + Hung Lin + 2015-04-02 14:01:43 -0700 + Commit: 58e2b3f, github.com/apache/spark/pull/5124 + + [SPARK-6079] Use index to speed up StatusTracker.getJobIdsForGroup() + Josh Rosen + 2015-03-25 17:40:00 -0700 + Commit: a6664dc, github.com/apache/spark/pull/4830 + + [SPARK-6667] [PySpark] remove setReuseAddress + Davies Liu + 2015-04-02 12:18:33 -0700 + Commit: ee2bd70, github.com/apache/spark/pull/5324 + + Revert "[SPARK-6618][SQL] HiveMetastoreCatalog.lookupRelation should use fine-grained lock" + Cheng Lian + 2015-04-02 12:59:38 +0800 + Commit: 1160cc9 + + [SQL] SPARK-6658: Update DataFrame documentation to refer to correct types + Michael Armbrust + 2015-04-01 18:00:07 -0400 + Commit: 223dd3f + + [SPARK-6578] Small rewrite to make the logic more clear in MessageWithHeader.transferTo. + Reynold Xin + 2015-04-01 18:36:06 -0700 + Commit: d697b76, github.com/apache/spark/pull/5319 + + [SPARK-6660][MLLIB] pythonToJava doesn't recognize object arrays + Xiangrui Meng + 2015-04-01 18:17:07 -0700 + Commit: 0d1e476, github.com/apache/spark/pull/5318 + + [SPARK-6553] [pyspark] Support functools.partial as UDF + ksonj + 2015-04-01 17:23:57 -0700 + Commit: 98f72df, github.com/apache/spark/pull/5206 + + [SPARK-6642][MLLIB] use 1.2 lambda scaling and remove addImplicit from NormalEquation + Xiangrui Meng + 2015-04-01 16:47:18 -0700 + Commit: bc04fa2, github.com/apache/spark/pull/5314 + + [SPARK-6578] [core] Fix thread-safety issue in outbound path of network library. + Marcelo Vanzin + 2015-04-01 16:06:11 -0700 + Commit: 1c31ebd, github.com/apache/spark/pull/5234 + + [SPARK-6657] [Python] [Docs] fixed python doc build warnings + Joseph K. Bradley + 2015-04-01 15:15:47 -0700 + Commit: e347a7a, github.com/apache/spark/pull/5317 + + [SPARK-6651][MLLIB] delegate dense vector arithmetics to the underlying numpy array + Xiangrui Meng + 2015-04-01 13:29:04 -0700 + Commit: f50d95a, github.com/apache/spark/pull/5312 + + SPARK-6626 [DOCS]: Corrected Scala:TwitterUtils parameters + jayson + 2015-04-01 11:12:55 +0100 + Commit: 7d029cb, github.com/apache/spark/pull/5295 + + [Doc] Improve Python DataFrame documentation + Reynold Xin + 2015-03-31 18:31:36 -0700 + Commit: e527b35, github.com/apache/spark/pull/5287 + + [SPARK-6614] OutputCommitCoordinator should clear authorized committer only after authorized committer fails, not after any failure + Josh Rosen + 2015-03-31 16:18:39 -0700 + Commit: c4c982a, github.com/apache/spark/pull/5276 + + [SPARK-6633][SQL] Should be "Contains" instead of "EndsWith" when constructing sources.StringContains + Liang-Chi Hsieh + 2015-03-31 13:18:07 -0700 + Commit: d851646, github.com/apache/spark/pull/5299 + + [SPARK-5371][SQL] Propagate types after function conversion, before futher resolution + Michael Armbrust + 2015-03-31 11:34:29 -0700 + Commit: 5a957fe, github.com/apache/spark/pull/5278 + + [SPARK-6145][SQL] fix ORDER BY on nested fields + Michael Armbrust + 2015-03-31 11:23:18 -0700 + Commit: 045228f, github.com/apache/spark/pull/5189 + + [SPARK-6575] [SQL] Adds configuration to disable schema merging while converting metastore Parquet tables + Cheng Lian + 2015-03-31 11:21:15 -0700 + Commit: 778c876, github.com/apache/spark/pull/5231 + + [SPARK-6555] [SQL] Overrides equals() and hashCode() for MetastoreRelation + Cheng Lian + 2015-03-31 11:18:25 -0700 + Commit: 9ebefb1, github.com/apache/spark/pull/5289 + + [SPARK-6618][SQL] HiveMetastoreCatalog.lookupRelation should use fine-grained lock + Yin Huai + 2015-03-31 16:28:40 +0800 + Commit: fd600ce, github.com/apache/spark/pull/5281 + + [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python. + Reynold Xin + 2015-03-31 00:25:23 -0700 + Commit: cf651a4, github.com/apache/spark/pull/5284 + + [SPARK-6625][SQL] Add common string filters to data sources. 
+ Reynold Xin + 2015-03-31 00:19:51 -0700 + Commit: a97d4e6, github.com/apache/spark/pull/5285 + + [SPARK-6119][SQL] DataFrame support for missing data handling + Reynold Xin + 2015-03-30 20:47:10 -0700 + Commit: 67c885e, github.com/apache/spark/pull/5274 + + [SPARK-6369] [SQL] Uses commit coordinator to help committing Hive and Parquet tables + Cheng Lian + 2015-03-31 07:48:37 +0800 + Commit: fedbfc7, github.com/apache/spark/pull/5139 + + [SPARK-6603] [PySpark] [SQL] add SQLContext.udf and deprecate inferSchema() and applySchema + Davies Liu + 2015-03-30 15:47:00 -0700 + Commit: 30e7c63, github.com/apache/spark/pull/5273 + + [SPARK-6592][SQL] fix filter for scaladoc to generate API doc for Row class under catalyst dir + CodingCat + 2015-03-30 11:54:44 -0700 + Commit: f9d4efa, github.com/apache/spark/pull/5252 + + [SPARK-6571][MLLIB] use wrapper in MatrixFactorizationModel.load + Xiangrui Meng + 2015-03-28 15:08:05 -0700 + Commit: 93a7166, github.com/apache/spark/pull/5243 + + [SPARK-6595][SQL] MetastoreRelation should be a MultiInstanceRelation + Michael Armbrust + 2015-03-30 22:24:12 +0800 + Commit: c411530, github.com/apache/spark/pull/5251 + + [SPARK-6558] Utils.getCurrentUserName returns the full principal name instead of login name + Thomas Graves + 2015-03-29 12:43:30 +0100 + Commit: f8132de, github.com/apache/spark/pull/5229 + + [SPARK-5750][SPARK-3441][SPARK-5836][CORE] Added documentation explaining shuffle + Ilya Ganelin , Ilya Ganelin + 2015-03-30 11:52:02 +0100 + Commit: 1c59a4b, github.com/apache/spark/pull/5074 + + [spark-sql] a better exception message than "scala.MatchError" for unsupported types in Schema creation + Eran Medan + 2015-03-30 00:02:52 -0700 + Commit: 4859c40, github.com/apache/spark/pull/5235 + + [HOTFIX] Build break due to NoRelation cherry-pick. + Reynold Xin + 2015-03-29 12:07:28 -0700 + Commit: 6181366 + + [DOC] Improvements to Python docs. 
+ Reynold Xin + 2015-03-28 23:59:27 -0700 + Commit: 3db0844, github.com/apache/spark/pull/5238 + + [SPARK-6538][SQL] Add missing nullable Metastore fields when merging a Parquet schema + Adam Budde + 2015-03-28 09:14:09 +0800 + Commit: 5e04f45, github.com/apache/spark/pull/5214 + + [SPARK-6564][SQL] SQLContext.emptyDataFrame should contain 0 row, not 1 row + Reynold Xin + 2015-03-27 14:56:57 -0700 + Commit: 7006858, github.com/apache/spark/pull/5226 + + [SPARK-6544][build] Increment Avro version from 1.7.6 to 1.7.7 + Dean Chen + 2015-03-27 14:32:51 +0000 + Commit: fefd49f, github.com/apache/spark/pull/5193 + + [SPARK-6574] [PySpark] fix sql example + Davies Liu + 2015-03-27 11:42:26 -0700 + Commit: b902a95, github.com/apache/spark/pull/5230 + + [SPARK-6550][SQL] Use analyzed plan in DataFrame + Michael Armbrust + 2015-03-27 11:40:00 -0700 + Commit: bc75189, github.com/apache/spark/pull/5217 + + [SPARK-6341][mllib] Upgrade breeze from 0.11.1 to 0.11.2 + Yu ISHIKAWA + 2015-03-27 00:15:02 -0700 + Commit: b318858, github.com/apache/spark/pull/5222 + + [DOCS][SQL] Fix JDBC example + Michael Armbrust + 2015-03-26 14:51:46 -0700 + Commit: 54d92b5, github.com/apache/spark/pull/5192 + + [SPARK-6554] [SQL] Don't push down predicates which reference partition column(s) + Cheng Lian + 2015-03-26 13:11:37 -0700 + Commit: 3d54578, github.com/apache/spark/pull/5210 + + [SPARK-6117] [SQL] Improvements to DataFrame.describe() + Reynold Xin + 2015-03-26 12:26:13 -0700 + Commit: 28e3a1e, github.com/apache/spark/pull/5201 + + [SPARK-6117] [SQL] add describe function to DataFrame for summary statis... + azagrebin + 2015-03-26 00:25:04 -0700 + Commit: 84735c3, github.com/apache/spark/pull/5073 + + SPARK-6480 [CORE] histogram() bucket function is wrong in some simple edge cases + Sean Owen + 2015-03-26 15:00:23 +0000 + Commit: aa2d157, github.com/apache/spark/pull/5148 + + [SPARK-6491] Spark will put the current working dir to the CLASSPATH + guliangliang + 2015-03-26 13:28:56 +0000 + Commit: 5b5f0e2, github.com/apache/spark/pull/5156 + + [SQL][SPARK-6471]: Metastore schema should only be a subset of parquet schema to support dropping of columns using replace columns + Yash Datta + 2015-03-26 21:13:38 +0800 + Commit: 836c921, github.com/apache/spark/pull/5141 + + [SPARK-6465][SQL] Fix serialization of GenericRowWithSchema using kryo + Michael Armbrust + 2015-03-26 18:46:57 +0800 + Commit: 8254996, github.com/apache/spark/pull/5191 + + [SPARK-6536] [PySpark] Column.inSet() in Python + Davies Liu + 2015-03-26 00:01:24 -0700 + Commit: 0ba7599, github.com/apache/spark/pull/5190 + + [SPARK-6463][SQL] AttributeSet.equal should compare size + sisihj , Michael Armbrust + 2015-03-25 19:21:54 -0700 + Commit: 9edb34f, github.com/apache/spark/pull/5194 + + [SPARK-6450] [SQL] Fixes metastore Parquet table conversion + Cheng Lian + 2015-03-25 17:40:19 -0700 + Commit: 0cd4748, github.com/apache/spark/pull/5183 + + [SPARK-6409][SQL] It is not necessary that avoid old inteface of hive, because this will make some UDAF can not work. 
+ DoingDone9 <799203320@qq.com> + 2015-03-25 11:11:52 -0700 + Commit: 4efa6c5, github.com/apache/spark/pull/5131 + + SPARK-6063 MLlib doesn't pass mvn scalastyle check due to UTF chars in LDAModel.scala + Michael Griffiths , Griffiths, Michael (NYC-RPM) + 2015-02-28 14:47:39 +0000 + Commit: 6791f42, github.com/apache/spark/pull/4815 + + [SPARK-6496] [MLLIB] GeneralizedLinearAlgorithm.run(input, initialWeights) should initialize numFeatures + Yanbo Liang + 2015-03-25 17:05:56 +0000 + Commit: 2be4255, github.com/apache/spark/pull/5167 + + [DOCUMENTATION]Fixed Missing Type Import in Documentation + Bill Chambers , anabranch + 2015-03-24 22:24:35 -0700 + Commit: 8e4e2e3, github.com/apache/spark/pull/5179 + + [SPARK-6469] Improving documentation on YARN local directories usage + Christophe Préaud + 2015-03-24 17:05:49 -0700 + Commit: 6af9408, github.com/apache/spark/pull/5165 + + [SPARK-3570] Include time to open files in shuffle write time. + Kay Ousterhout + 2015-03-24 16:29:40 -0700 + Commit: e4db5a3, github.com/apache/spark/pull/4550 + + [SPARK-6088] Correct how tasks that get remote results are shown in UI. + Kay Ousterhout + 2015-03-24 16:26:43 -0700 + Commit: de8b2d4, github.com/apache/spark/pull/4839 + + [SPARK-6428][SQL] Added explicit types for all public methods in catalyst + Reynold Xin + 2015-03-24 16:03:55 -0700 + Commit: 586e0d9, github.com/apache/spark/pull/5162 + + [SPARK-6209] Clean up connections in ExecutorClassLoader after failing to load classes (master branch PR) + Josh Rosen + 2015-03-24 14:38:20 -0700 + Commit: dcf56aa, github.com/apache/spark/pull/4944 + + [SPARK-6458][SQL] Better error messages for invalid data sources + Michael Armbrust + 2015-03-24 14:10:56 -0700 + Commit: f48c16d, github.com/apache/spark/pull/5158 + + [SPARK-6376][SQL] Avoid eliminating subqueries until optimization + Michael Armbrust + 2015-03-24 14:08:20 -0700 + Commit: df671bc, github.com/apache/spark/pull/5160 + + [SPARK-6375][SQL] Fix formatting of error messages. 
+ Michael Armbrust + 2015-03-24 13:22:46 -0700 + Commit: 92bf888, github.com/apache/spark/pull/5155 + + Revert "[SPARK-5680][SQL] Sum function on all null values, should return zero" + Michael Armbrust + 2015-03-24 12:32:25 -0700 + Commit: 930b667 + + [SPARK-6054][SQL] Fix transformations of TreeNodes that hold StructTypes + Michael Armbrust + 2015-03-24 12:28:01 -0700 + Commit: c699e2b, github.com/apache/spark/pull/5157 + + [SPARK-6437][SQL] Use completion iterator to close external sorter + Michael Armbrust + 2015-03-24 12:10:30 -0700 + Commit: c0101d3, github.com/apache/spark/pull/5161 + + [SPARK-6459][SQL] Warn when constructing trivially true equals predicate + Michael Armbrust + 2015-03-24 12:09:02 -0700 + Commit: f0141ca, github.com/apache/spark/pull/5163 + + [SPARK-5955][MLLIB] add checkpointInterval to ALS + Xiangrui Meng + 2015-03-20 15:02:57 -0400 + Commit: bc92a2e, github.com/apache/spark/pull/5076 + + [ML][docs][minor] Define LabeledDocument/Document classes in CV example + Peter Rudenko + 2015-03-24 16:33:38 +0000 + Commit: 4ff5771, github.com/apache/spark/pull/5135 + + [SPARK-5559] [Streaming] [Test] Remove oppotunity we met flakiness when running FlumeStreamSuite + Kousuke Saruta + 2015-03-24 16:13:25 +0000 + Commit: 8722369, github.com/apache/spark/pull/4337 + + Update the command to use IPython notebook + Cong Yue + 2015-03-24 12:56:13 +0000 + Commit: e545143, github.com/apache/spark/pull/5111 + + [SPARK-6452] [SQL] Checks for missing attributes and unresolved operator for all types of operator + Cheng Lian + 2015-03-24 01:12:11 -0700 + Commit: 6f10142, github.com/apache/spark/pull/5129 + + [SPARK-6124] Support jdbc connection properties in OPTIONS part of the query + Volodymyr Lyubinets + 2015-03-23 17:00:27 -0700 + Commit: 04b2078, github.com/apache/spark/pull/4859 + + [SPARK-6397][SQL] Check the missingInput simply + Yadong Qi + 2015-03-23 18:16:49 +0800 + Commit: a29f493, github.com/apache/spark/pull/5132 + + [SPARK-4985] [SQL] parquet support for date type + Daoyuan Wang + 2015-03-23 11:46:16 +0800 + Commit: 60b9b96, github.com/apache/spark/pull/3822 + + [SPARK-6337][Documentation, SQL]Spark 1.3 doc fixes + vinodkc + 2015-03-22 20:00:08 +0000 + Commit: 857e8a6, github.com/apache/spark/pull/5112 + + SPARK-6454 [DOCS] Fix links to pyspark api + Kamil Smuga , stderr + 2015-03-22 15:56:25 +0000 + Commit: 3ba295f, github.com/apache/spark/pull/5120 + + [SPARK-6408] [SQL] Fix JDBCRDD filtering string literals + ypcat , Pei-Lun Lee + 2015-03-22 15:49:13 +0800 + Commit: e60fbf6, github.com/apache/spark/pull/5087 + + [SPARK-6428][SQL] Added explicit type for all public methods for Hive module + Reynold Xin + 2015-03-21 14:30:04 -0700 + Commit: 0021d22, github.com/apache/spark/pull/5108 + + [SPARK-6428][SQL] Added explicit type for all public methods in sql/core + Reynold Xin + 2015-03-20 15:47:07 -0700 + Commit: c964588, github.com/apache/spark/pull/5104 + + [SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser. 
+ Yin Huai + 2015-03-21 13:27:53 -0700 + Commit: 102daaf, github.com/apache/spark/pull/5078 + + [SPARK-5680][SQL] Sum function on all null values, should return zero + Venkata Ramana G , Venkata Ramana Gollamudi + 2015-03-21 13:24:24 -0700 + Commit: 93975a3, github.com/apache/spark/pull/4466 + + [SPARK-5320][SQL]Add statistics method at NoRelation (override super). + x1- + 2015-03-21 13:22:34 -0700 + Commit: cba6842, github.com/apache/spark/pull/5105 + + [SPARK-5821] [SQL] JSON CTAS command should throw error message when delete path failure + Yanbo Liang , Yanbo Liang + 2015-03-21 11:23:28 +0800 + Commit: 8de90c7, github.com/apache/spark/pull/4610 + + [SPARK-6315] [SQL] Also tries the case class string parser while reading Parquet schema + Cheng Lian + 2015-03-21 11:18:45 +0800 + Commit: b75943f, github.com/apache/spark/pull/5034 + + [SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful + Yanbo Liang + 2015-03-21 10:53:04 +0800 + Commit: df83e21, github.com/apache/spark/pull/5107 + + [SPARK-6421][MLLIB] _regression_train_wrapper does not test initialWeights correctly + lewuathe + 2015-03-20 17:18:18 -0400 + Commit: aff9f8d, github.com/apache/spark/pull/5101 + + [SPARK-6286][Mesos][minor] Handle missing Mesos case TASK_ERROR + Jongyoul Lee + 2015-03-20 12:24:34 +0000 + Commit: db812d9, github.com/apache/spark/pull/5088 + + [SPARK-6222][Streaming] Dont delete checkpoint data when doing pre-batch-start checkpoint + Tathagata Das + 2015-03-19 02:15:50 -0400 + Commit: 03e263f, github.com/apache/spark/pull/5008 + + [SPARK-6325] [core,yarn] Do not change target executor count when killing executors. + Marcelo Vanzin + 2015-03-18 09:18:28 -0400 + Commit: 1723f05, github.com/apache/spark/pull/5018 + + [SPARK-6286][minor] Handle missing Mesos case TASK_ERROR. + Iulian Dragos + 2015-03-18 09:15:33 -0400 + Commit: ff0a7f4, github.com/apache/spark/pull/5000 + + [SPARK-6247][SQL] Fix resolution of ambiguous joins caused by new aliases + Michael Armbrust + 2015-03-17 19:47:51 -0700 + Commit: ba8352c, github.com/apache/spark/pull/5062 + + [SPARK-6383][SQL]Fixed compiler and errors in Dataframe examples + Tijo Thomas + 2015-03-17 18:50:19 -0700 + Commit: cee6d08, github.com/apache/spark/pull/5068 + + [SPARK-6366][SQL] In Python API, the default save mode for save and saveAsTable should be "error" instead of "append". + Yin Huai + 2015-03-18 09:41:06 +0800 + Commit: 3ea38bc, github.com/apache/spark/pull/5053 + + [SPARK-6330] [SQL] Add a test case for SPARK-6330 + Pei-Lun Lee + 2015-03-18 08:34:46 +0800 + Commit: 9d88f0c, github.com/apache/spark/pull/5039 + + [SPARK-6336] LBFGS should document what convergenceTol means + lewuathe + 2015-03-17 12:11:57 -0700 + Commit: 476c4e1, github.com/apache/spark/pull/5033 + + [SPARK-6365] jetty-security also needed for SPARK_PREPEND_CLASSES to work + Imran Rashid + 2015-03-17 12:03:54 -0500 + Commit: ac0e7cc, github.com/apache/spark/pull/5071 + + [SPARK-6313] Add config option to disable file locks/fetchFile cache to ... 
+ nemccarthy + 2015-03-17 09:33:11 -0700 + Commit: febb123, github.com/apache/spark/pull/5036 + + [SPARK-3266] Use intermediate abstract classes to fix type erasure issues in Java APIs + Josh Rosen + 2015-03-17 09:18:57 -0700 + Commit: 29e39e1, github.com/apache/spark/pull/5050 + + [SPARK-6331] Load new master URL if present when recovering streaming context from checkpoint + Tathagata Das + 2015-03-17 05:31:27 -0700 + Commit: 95f8d1c, github.com/apache/spark/pull/5024 + + [SQL][docs][minor] Fixed sample code in SQLContext scaladoc + Lomig Mégard + 2015-03-16 23:52:42 -0700 + Commit: 426816b, github.com/apache/spark/pull/5051 + + [SPARK-6299][CORE] ClassNotFoundException in standalone mode when running groupByKey with class defined in REPL + Kevin (Sangwoo) Kim + 2015-03-16 23:49:23 -0700 + Commit: 5c16ced, github.com/apache/spark/pull/5046 + + [SPARK-6077] Remove streaming tab while stopping StreamingContext + lisurprise + 2015-03-16 13:10:32 -0700 + Commit: 47cce98, github.com/apache/spark/pull/4828 + + [SPARK-6330] Fix filesystem bug in newParquet relation + Volodymyr Lyubinets + 2015-03-16 12:13:18 -0700 + Commit: 67fa6d1, github.com/apache/spark/pull/5020 + + SPARK-6245 [SQL] jsonRDD() of empty RDD results in exception + Sean Owen + 2015-03-11 14:09:09 +0000 + Commit: 684ff24, github.com/apache/spark/pull/4971 + + [SPARK-6300][Spark Core] sc.addFile(path) does not support the relative path. + DoingDone9 <799203320@qq.com> + 2015-03-16 12:27:15 +0000 + Commit: 724aab4, github.com/apache/spark/pull/4993 + + [SPARK-3619] Part 2. Upgrade to Mesos 0.21 to work around MESOS-1688 + Jongyoul Lee + 2015-03-15 15:46:55 +0000 + Commit: 43fcab0, github.com/apache/spark/pull/4361 + + [SPARK-6210] [SQL] use prettyString as column name in agg() + Davies Liu + 2015-03-14 00:43:33 -0700 + Commit: ad47563, github.com/apache/spark/pull/5006 + + [SPARK-6275][Documentation]Miss toDF() function in docs/sql-programming-guide.md + zzcclp + 2015-03-12 15:07:15 +0000 + Commit: 3012781, github.com/apache/spark/pull/4977 + + [SPARK-6133] Make sc.stop() idempotent + Andrew Or + 2015-03-03 15:09:57 -0800 + Commit: a08588c, github.com/apache/spark/pull/4871 + + [SPARK-6132][HOTFIX] ContextCleaner InterruptedException should be quiet + Andrew Or + 2015-03-03 20:49:45 -0800 + Commit: 338bea7, github.com/apache/spark/pull/4882 + + [SPARK-6132] ContextCleaner race condition across SparkContexts + Andrew Or + 2015-03-03 13:44:05 -0800 + Commit: 3cdc8a3, github.com/apache/spark/pull/4869 + + [SPARK-6087][CORE] Provide actionable exception if Kryo buffer is not large enough + Lev Khomich + 2015-03-10 10:55:42 +0000 + Commit: 9846790, github.com/apache/spark/pull/4947 + + [SPARK-6036][CORE] avoid race condition between eventlogListener and akka actor system + Zhang, Liye + 2015-02-26 23:11:43 -0800 + Commit: f81611d, github.com/apache/spark/pull/4785 + + SPARK-4044 [CORE] Thriftserver fails to start when JAVA_HOME points to JRE instead of JDK + Sean Owen + 2015-03-13 17:59:31 +0000 + Commit: 4aa4132, github.com/apache/spark/pull/4981 + + SPARK-4300 [CORE] Race condition during SparkWorker shutdown + Sean Owen + 2015-02-26 14:08:56 -0800 + Commit: a3493eb, github.com/apache/spark/pull/4787 + + [SPARK-6194] [SPARK-677] [PySpark] fix memory leak in collect() + 
Davies Liu + 2015-03-09 16:24:06 -0700 + Commit: 170af49, github.com/apache/spark/pull/4923 + + SPARK-4704 [CORE] SparkSubmitDriverBootstrap doesn't flush output + Sean Owen + 2015-02-26 12:56:54 -0800 + Commit: dbee7e1, github.com/apache/spark/pull/4788 + + [SPARK-6278][MLLIB] Mention the change of objective in linear regression + Xiangrui Meng + 2015-03-13 10:27:28 -0700 + Commit: 214f681, github.com/apache/spark/pull/4978 + + [SPARK-5310] [SQL] [DOC] Parquet section for the SQL programming guide + Cheng Lian + 2015-03-13 21:34:50 +0800 + Commit: dc287f3, github.com/apache/spark/pull/5001 + + [mllib] [python] Add LassoModel to __all__ in regression.py + Joseph K. Bradley + 2015-03-12 16:46:29 -0700 + Commit: 23069bd, github.com/apache/spark/pull/4970 + + [SPARK-6294] fix hang when call take() in JVM on PythonRDD + Davies Liu + 2015-03-12 01:34:38 -0700 + Commit: 850e694, github.com/apache/spark/pull/4987 + + [SPARK-6296] [SQL] Added equals to Column + Volodymyr Lyubinets + 2015-03-12 00:55:26 -0700 + Commit: d9e141c, github.com/apache/spark/pull/4988 + + [SPARK-6128][Streaming][Documentation] Updates to Spark Streaming Programming Guide + Tathagata Das + 2015-03-11 18:48:21 -0700 + Commit: bdc4682, github.com/apache/spark/pull/4956 + + [SPARK-6274][Streaming][Examples] Added examples streaming + sql examples. + Tathagata Das + 2015-03-11 11:19:51 -0700 + Commit: ac61466, github.com/apache/spark/pull/4975 + + [SPARK-5183][SQL] Update SQL Docs with JDBC and Migration Guide + Michael Armbrust + 2015-03-10 18:13:09 -0700 + Commit: edbcb6f, github.com/apache/spark/pull/4958 + + Minor doc: Remove the extra blank line in data types javadoc. + Reynold Xin + 2015-03-10 17:25:04 -0700 + Commit: 7295192, github.com/apache/spark/pull/4955 + + [SPARK-5310][Doc] Update SQL Programming Guide to include DataFrames. 
+ Reynold Xin + 2015-03-09 16:16:16 -0700 + Commit: bc53d3d, github.com/apache/spark/pull/4954 + + [Docs] Replace references to SchemaRDD with DataFrame + Reynold Xin + 2015-03-09 13:29:19 -0700 + Commit: 5e58f76, github.com/apache/spark/pull/4952 + + Preparing development version 1.3.1-SNAPSHOT + Patrick Wendell + 2015-03-05 23:02:08 +0000 + Commit: c152f9a + + +Release 1.3.0 + + [SQL] Make Strategies a public developer API + Michael Armbrust + 2015-03-05 14:50:25 -0800 + Commit: 556e0de, github.com/apache/spark/pull/4920 + + [SPARK-6163][SQL] jsonFile should be backed by the data source API + Yin Huai + 2015-03-05 14:49:44 -0800 + Commit: 083fed5, github.com/apache/spark/pull/4896 + + [SPARK-6145][SQL] fix ORDER BY on nested fields + Wenchen Fan , Michael Armbrust + 2015-03-05 14:49:01 -0800 + Commit: e358f55, github.com/apache/spark/pull/4918 + + [SPARK-6175] Fix standalone executor log links when ephemeral ports or SPARK_PUBLIC_DNS are used + Josh Rosen + 2015-03-05 12:04:00 -0800 + Commit: 988b498, github.com/apache/spark/pull/4903 + + SPARK-6182 [BUILD] spark-parent pom needs to be published for both 2.10 and 2.11 + Sean Owen + 2015-03-05 11:31:48 -0800 + Commit: ae315d2, github.com/apache/spark/pull/4912 + + Revert "[SPARK-6153] [SQL] promote guava dep for hive-thriftserver" + Cheng Lian + 2015-03-05 17:58:18 +0800 + Commit: f8205d3 + + [SPARK-6153] [SQL] promote guava dep for hive-thriftserver + Daoyuan Wang + 2015-03-05 16:35:17 +0800 + Commit: b92d925, github.com/apache/spark/pull/4884 + + Updating CHANGES file + Patrick Wendell + 2015-03-04 21:19:49 -0800 + Commit: 87eac3c + + SPARK-5143 [BUILD] [WIP] spark-network-yarn 2.11 depends on spark-network-shuffle 2.10 + Sean Owen + 2015-03-04 21:00:51 -0800 + Commit: f509159, github.com/apache/spark/pull/4876 + + [SPARK-6149] [SQL] [Build] Excludes Guava 15 referenced by jackson-module-scala_2.10 + Cheng Lian + 2015-03-04 20:52:58 -0800 + Commit: a0aa24a, github.com/apache/spark/pull/4890 + + [SPARK-6144] [core] Fix addFile when source files are on "hdfs:" + Marcelo Vanzin , trystanleftwich + 2015-03-04 12:58:39 -0800 + Commit: 3fc74f4, github.com/apache/spark/pull/4894 + + [SPARK-6134][SQL] Fix wrong datatype for casting FloatType and default LongType value in defaultPrimitive + Liang-Chi Hsieh + 2015-03-04 20:23:43 +0800 + Commit: bfa4e31, github.com/apache/spark/pull/4870 + + [SPARK-6136] [SQL] Removed JDBC integration tests which depends on docker-client + Cheng Lian + 2015-03-04 19:39:02 +0800 + Commit: 035243d, github.com/apache/spark/pull/4872 + + [SPARK-6141][MLlib] Upgrade Breeze from 0.10 to 0.11 to fix convergence bug + Xiangrui Meng , DB Tsai , DB Tsai + 2015-03-03 23:52:02 -0800 + Commit: 9f24977, github.com/apache/spark/pull/4879 + + [SPARK-5949] HighlyCompressedMapStatus needs more classes registered w/ kryo + Imran Rashid + 2015-03-03 15:33:19 -0800 + Commit: 9a0b75c, github.com/apache/spark/pull/4877 + + SPARK-1911 [DOCS] Warn users if their assembly jars are not built with Java 6 + Sean Owen + 2015-03-03 13:40:11 -0800 + Commit: 8446ad0, github.com/apache/spark/pull/4874 + + Revert "[SPARK-5423][Core] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file" + Andrew Or + 2015-03-03 13:04:15 -0800 + Commit: ee4929d + + Adding CHANGES.txt for Spark 1.3 + Patrick Wendell + 
2015-03-03 02:19:19 -0800 + Commit: ce7158c + + BUILD: Minor tweaks to internal build scripts + Patrick Wendell + 2015-03-03 00:38:12 -0800 + Commit: ae60eb9 + + HOTFIX: Bump HBase version in MapR profiles. + Patrick Wendell + 2015-03-03 01:38:07 -0800 + Commit: 1aa8461 + + [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression + DB Tsai + 2015-03-02 22:37:12 -0800 + Commit: 841d2a2, github.com/apache/spark/pull/4866 + + [SPARK-6120] [mllib] Warnings about memory in tree, ensemble model save + Joseph K. Bradley + 2015-03-02 22:33:51 -0800 + Commit: 81648a7, github.com/apache/spark/pull/4864 + + [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib + Xiangrui Meng + 2015-03-02 22:27:01 -0800 + Commit: 62c53be, github.com/apache/spark/pull/4854 + + [SPARK-5310][SQL] Fixes to Docs and Datasources API + Reynold Xin , Michael Armbrust + 2015-03-02 22:14:08 -0800 + Commit: 4e6e008, github.com/apache/spark/pull/4868 + + [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api + Yin Huai + 2015-03-02 19:31:55 -0800 + Commit: 1b490e9, github.com/apache/spark/pull/4826 + + [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs + Tathagata Das + 2015-03-02 18:40:46 -0800 + Commit: ffd0591, github.com/apache/spark/pull/4860 + + [SPARK-5537] Add user guide for multinomial logistic regression + Xiangrui Meng , DB Tsai + 2015-03-02 18:10:50 -0800 + Commit: 11389f0, github.com/apache/spark/pull/4801 + + [SPARK-6121][SQL][MLLIB] simpleString for UDT + Xiangrui Meng + 2015-03-02 17:14:34 -0800 + Commit: 1b8ab57, github.com/apache/spark/pull/4858 + + [SPARK-6048] SparkConf should not translate deprecated configs on set + Andrew Or + 2015-03-02 16:36:42 -0800 + Commit: ea69cf2, github.com/apache/spark/pull/4799 + + [SPARK-6066] Make event log format easier to parse + Andrew Or + 2015-03-02 16:34:32 -0800 + Commit: 8100b79, github.com/apache/spark/pull/4821 + + [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables + Cheng Lian + 2015-03-02 16:18:00 -0800 + Commit: 866f281, github.com/apache/spark/pull/4842 + + [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved + Michael Armbrust + 2015-03-02 16:10:54 -0800 + Commit: 3899c7c, github.com/apache/spark/pull/4855 + + [SPARK-6050] [yarn] Relax matching of vcore count in received containers. 
+ Marcelo Vanzin + 2015-03-02 16:41:43 -0600 + Commit: 650d1e7, github.com/apache/spark/pull/4818 + + [SPARK-6040][SQL] Fix the percent bug in tablesample + q00251598 + 2015-03-02 13:16:29 -0800 + Commit: a83b9bb, github.com/apache/spark/pull/4789 + + [Minor] Fix doc typo for describing primitiveTerm effectiveness condition + Liang-Chi Hsieh + 2015-03-02 13:11:17 -0800 + Commit: f92876a, github.com/apache/spark/pull/4762 + + SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs + Sean Owen + 2015-03-02 21:10:08 +0000 + Commit: 58e7198, github.com/apache/spark/pull/4843 + + [DOCS] Refactored Dataframe join comment to use correct parameter ordering + Paul Power + 2015-03-02 13:08:47 -0800 + Commit: 54ac243, github.com/apache/spark/pull/4847 + + [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark + Yanbo Liang + 2015-03-02 10:17:24 -0800 + Commit: 4ffaf85, github.com/apache/spark/pull/4831 + + [SPARK-5741][SQL] Support the path contains comma in HiveContext + q00251598 + 2015-03-02 10:13:11 -0800 + Commit: f476108, github.com/apache/spark/pull/4532 + + [SPARK-6111] Fixed usage string in documentation. + Kenneth Myers + 2015-03-02 17:25:24 +0000 + Commit: b2b7f01, github.com/apache/spark/pull/4852 + + [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true + Yin Huai + 2015-03-02 23:18:07 +0800 + Commit: a3fef2c, github.com/apache/spark/pull/4806 + + [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect + Yin Huai + 2015-03-02 22:42:18 +0800 + Commit: c59871c, github.com/apache/spark/pull/4824 + + [Streaming][Minor]Fix some error docs in streaming examples + Saisai Shao + 2015-03-02 08:49:19 +0000 + Commit: 1fe677a, github.com/apache/spark/pull/4837 + + [SPARK-6083] [MLLib] [DOC] Make Python API example consistent in NaiveBayes + MechCoder + 2015-03-01 16:28:15 -0800 + Commit: 6a2fc85, github.com/apache/spark/pull/4834 + + [SPARK-6053][MLLIB] support save/load in PySpark's ALS + Xiangrui Meng + 2015-03-01 16:26:57 -0800 + Commit: b570d98, github.com/apache/spark/pull/4811 + + [SPARK-6074] [sql] Package pyspark sql bindings. + Marcelo Vanzin + 2015-03-01 11:05:10 +0000 + Commit: bb16618, github.com/apache/spark/pull/4822 + + SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException + Evan Yu + 2015-02-28 18:55:34 -0800 + Commit: 317694c, github.com/apache/spark/pull/4804 + + [SPARK-5775] [SQL] BugFix: GenericRow cannot be cast to SpecificMutableRow when nested data and partitioned table + Cheng Lian , Cheng Lian , Yin Huai + 2015-02-28 21:15:43 +0800 + Commit: aa39460, github.com/apache/spark/pull/4792 + + [SPARK-5979][SPARK-6032] Smaller safer --packages fix + Burak Yavuz + 2015-02-27 22:59:35 -0800 + Commit: 5a55c96, github.com/apache/spark/pull/4802 + + [SPARK-6070] [yarn] Remove unneeded classes from shuffle service jar. 
+ Marcelo Vanzin + 2015-02-27 22:44:11 -0800 + Commit: 1747e0a, github.com/apache/spark/pull/4820 + + [SPARK-6055] [PySpark] fix incorrect __eq__ of DataType + Davies Liu + 2015-02-27 20:07:17 -0800 + Commit: 49f2187, github.com/apache/spark/pull/4808 + + [SPARK-5751] [SQL] Sets SPARK_HOME as SPARK_PID_DIR when running Thrift server test suites + Cheng Lian + 2015-02-28 08:41:49 +0800 + Commit: 5d19cf0, github.com/apache/spark/pull/4758 + + [Streaming][Minor] Remove useless type signature of Java Kafka direct stream API + Saisai Shao + 2015-02-27 13:01:42 -0800 + Commit: ceebe3c, github.com/apache/spark/pull/4817 + + [SPARK-4587] [mllib] [docs] Fixed save,load calls in ML guide examples + Joseph K. Bradley + 2015-02-27 13:00:36 -0800 + Commit: 117e10c, github.com/apache/spark/pull/4816 + + [SPARK-6058][Yarn] Log the user class exception in ApplicationMaster + zsxwing + 2015-02-27 13:31:46 +0000 + Commit: bff8088, github.com/apache/spark/pull/4813 + + fix spark-6033, clarify the spark.worker.cleanup behavior in standalone mode + 许鹏 + 2015-02-26 23:05:56 -0800 + Commit: b8db84c, github.com/apache/spark/pull/4803 + + SPARK-2168 [Spark core] Use relative URIs for the app links in the History Server. + Lukasz Jastrzebski + 2015-02-26 22:38:06 -0800 + Commit: 485b919, github.com/apache/spark/pull/4778 + + [SPARK-6024][SQL] When a data source table has too many columns, it's schema cannot be stored in metastore. + Yin Huai + 2015-02-26 20:46:05 -0800 + Commit: 6200f07, github.com/apache/spark/pull/4795 + + [SPARK-6037][SQL] Avoiding duplicate Parquet schema merging + Liang-Chi Hsieh + 2015-02-27 11:06:47 +0800 + Commit: 25a109e, github.com/apache/spark/pull/4786 + + SPARK-4579 [WEBUI] Scheduling Delay appears negative + Sean Owen + 2015-02-26 17:35:09 -0800 + Commit: b83a93e, github.com/apache/spark/pull/4796 + + [SPARK-5951][YARN] Remove unreachable driver memory properties in yarn client mode + mohit.goyal + 2015-02-26 14:27:47 -0800 + Commit: 5b426cb, github.com/apache/spark/pull/4730 + + Add a note for context termination for History server on Yarn + moussa taifi + 2015-02-26 14:19:43 -0800 + Commit: 297c3ef, github.com/apache/spark/pull/4721 + + [SPARK-6018] [YARN] NoSuchMethodError in Spark app is swallowed by YARN AM + Cheolsoo Park + 2015-02-26 13:53:49 -0800 + Commit: fe79674, github.com/apache/spark/pull/4773 + + [SPARK-6027][SPARK-5546] Fixed --jar and --packages not working for KafkaUtils and improved error message + Tathagata Das + 2015-02-26 13:46:07 -0800 + Commit: 731a997, github.com/apache/spark/pull/4779 + + Modify default value description for spark.scheduler.minRegisteredResourcesRatio on docs. 
+ Li Zhihui + 2015-02-26 13:07:07 -0800 + Commit: 62652dc, github.com/apache/spark/pull/4781 + + [SPARK-5363] Fix bug in PythonRDD: remove() inside iterator is not safe + Davies Liu + 2015-02-26 11:54:17 -0800 + Commit: 5d309ad, github.com/apache/spark/pull/4776 + + [SPARK-6015] fix links to source code in Python API docs + Davies Liu + 2015-02-26 10:45:29 -0800 + Commit: dafb3d2, github.com/apache/spark/pull/4772 + + [SPARK-6007][SQL] Add numRows param in DataFrame.show() + Jacky Li + 2015-02-26 10:40:58 -0800 + Commit: 7c779d8, github.com/apache/spark/pull/4767 + + [SPARK-6016][SQL] Cannot read the parquet table after overwriting the existing table when spark.sql.parquet.cacheMetadata=true + Yin Huai + 2015-02-27 01:01:32 +0800 + Commit: b5c5e93, github.com/apache/spark/pull/4775 + + [SPARK-6023][SQL] ParquetConversions fails to replace the destination MetastoreRelation of an InsertIntoTable node to ParquetRelation2 + Yin Huai + 2015-02-26 22:39:49 +0800 + Commit: e0f5fb0, github.com/apache/spark/pull/4782 + + [SPARK-5976][MLLIB] Add partitioner to factors returned by ALS + Xiangrui Meng + 2015-02-25 23:43:29 -0800 + Commit: a51d9db, github.com/apache/spark/pull/4748 + + [SPARK-1182][Docs] Sort the configuration parameters in configuration.md + Brennon York + 2015-02-25 16:12:56 -0800 + Commit: 56fa38a, github.com/apache/spark/pull/3863 + + [SPARK-5724] fix the misconfiguration in AkkaUtils + CodingCat + 2015-02-23 11:29:25 +0000 + Commit: b32a653, github.com/apache/spark/pull/4512 + + [SPARK-5974] [SPARK-5980] [mllib] [python] [docs] Update ML guide with save/load, Python GBT + Joseph K. Bradley + 2015-02-25 16:13:17 -0800 + Commit: a1b4856, github.com/apache/spark/pull/4750 + + [SPARK-5926] [SQL] make DataFrame.explain leverage queryExecution.logical + Yanbo Liang + 2015-02-25 15:37:13 -0800 + Commit: 5bd4b49, github.com/apache/spark/pull/4707 + + [SPARK-5999][SQL] Remove duplicate Literal matching block + Liang-Chi Hsieh + 2015-02-25 15:22:33 -0800 + Commit: 6fff9b8, github.com/apache/spark/pull/4760 + + [SPARK-6010] [SQL] Merging compatible Parquet schemas before computing splits + Cheng Lian + 2015-02-25 15:15:22 -0800 + Commit: 016f1f8, github.com/apache/spark/pull/4768 + + [SPARK-5944] [PySpark] fix version in Python API docs + Davies Liu + 2015-02-25 15:13:34 -0800 + Commit: 9aca3c6, github.com/apache/spark/pull/4731 + + [SPARK-5982] Remove incorrect Local Read Time Metric + Kay Ousterhout + 2015-02-25 14:55:24 -0800 + Commit: 791df93, github.com/apache/spark/pull/4749 + + [SPARK-1955][GraphX]: VertexRDD can incorrectly assume index sharing + Brennon York + 2015-02-25 14:11:12 -0800 + Commit: 8073767, github.com/apache/spark/pull/4705 + + SPARK-5930 [DOCS] Documented default of spark.shuffle.io.retryWait is confusing + Sean Owen + 2015-02-25 12:20:44 -0800 + Commit: eaffc6e, github.com/apache/spark/pull/4769 + + [SPARK-5996][SQL] Fix specialized outbound conversions + Michael Armbrust + 2015-02-25 10:13:40 -0800 + Commit: fada683, github.com/apache/spark/pull/4757 + + [SPARK-5994] [SQL] Python DataFrame documentation fixes + Davies Liu + 2015-02-24 20:51:55 -0800 + Commit: 5c421e0, github.com/apache/spark/pull/4756 + + [SPARK-5286][SQL] SPARK-5286 followup + Yin Huai + 2015-02-24 19:51:36 -0800 + Commit: e7a748e, 
github.com/apache/spark/pull/4755 + + [SPARK-5993][Streaming][Build] Fix assembly jar location of kafka-assembly + Tathagata Das + 2015-02-24 19:10:37 -0800 + Commit: 1e94894, github.com/apache/spark/pull/4753 + + [SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python. + Reynold Xin + 2015-02-24 18:59:23 -0800 + Commit: 5e233b2, github.com/apache/spark/pull/4752 + + [SPARK-5904][SQL] DataFrame Java API test suites. + Reynold Xin + 2015-02-24 18:51:41 -0800 + Commit: 78a1781, github.com/apache/spark/pull/4751 + + [SPARK-5751] [SQL] [WIP] Revamped HiveThriftServer2Suite for robustness + Cheng Lian + 2015-02-25 08:34:55 +0800 + Commit: 17ee246, github.com/apache/spark/pull/4720 + + [SPARK-5973] [PySpark] fix zip with two RDDs with AutoBatchedSerializer + Davies Liu + 2015-02-24 14:50:00 -0800 + Commit: 91bf0f8, github.com/apache/spark/pull/4745 + + [SPARK-5952][SQL] Lock when using hive metastore client + Michael Armbrust + 2015-02-24 13:39:29 -0800 + Commit: 641423d, github.com/apache/spark/pull/4746 + + [MLLIB] Change x_i to y_i in Variance's user guide + Xiangrui Meng + 2015-02-24 11:38:59 -0800 + Commit: a4ff445, github.com/apache/spark/pull/4740 + + [SPARK-5965] Standalone Worker UI displays {{USER_JAR}} + Andrew Or + 2015-02-24 11:08:07 -0800 + Commit: eaf7bf9, github.com/apache/spark/pull/4739 + + [Spark-5967] [UI] Correctly clean JobProgressListener.stageIdToActiveJobIds + Tathagata Das + 2015-02-24 11:02:47 -0800 + Commit: 28dd53b, github.com/apache/spark/pull/4741 + + [SPARK-5532][SQL] Repartition should not use external rdd representation + Michael Armbrust + 2015-02-24 10:52:18 -0800 + Commit: e46096b, github.com/apache/spark/pull/4738 + + [SPARK-5910][SQL] Support for as in selectExpr + Michael Armbrust + 2015-02-24 10:49:51 -0800 + Commit: ba5d60d, github.com/apache/spark/pull/4736 + + [SPARK-5968] [SQL] Suppresses ParquetOutputCommitter WARN logs + Cheng Lian + 2015-02-24 10:45:38 -0800 + Commit: 2b562b0, github.com/apache/spark/pull/4744 + + [SPARK-5958][MLLIB][DOC] update block matrix user guide + Xiangrui Meng + 2015-02-23 22:08:44 -0800 + Commit: dd42558, github.com/apache/spark/pull/4737 + + [SPARK-5873][SQL] Allow viewing of partially analyzed plans in queryExecution + Michael Armbrust + 2015-02-23 17:34:54 -0800 + Commit: 2d7786e, github.com/apache/spark/pull/4684 + + [SPARK-5935][SQL] Accept MapType in the schema provided to a JSON dataset. + Yin Huai , Yin Huai + 2015-02-23 17:16:34 -0800 + Commit: 33ccad2, github.com/apache/spark/pull/4710 + + [SPARK-5912] [docs] [mllib] Small fixes to ChiSqSelector docs + Joseph K. Bradley + 2015-02-23 16:15:57 -0800 + Commit: ae97040, github.com/apache/spark/pull/4732 + + [MLLIB] SPARK-5912 Programming guide for feature selection + Alexander Ulanov + 2015-02-23 12:09:40 -0800 + Commit: 8355773, github.com/apache/spark/pull/4709 + + [SPARK-5939][MLLib] make FPGrowth example app take parameters + Jacky Li + 2015-02-23 08:47:28 -0800 + Commit: 33b9084, github.com/apache/spark/pull/4714 + + [SPARK-5943][Streaming] Update the test to use new API to reduce the warning + Saisai Shao + 2015-02-23 11:27:27 +0000 + Commit: 67b7f79, github.com/apache/spark/pull/4722 + + [EXAMPLES] fix typo. 
+ Makoto Fukuhara + 2015-02-23 09:24:33 +0000 + Commit: f172387, github.com/apache/spark/pull/4724 + + Revert "[SPARK-4808] Removing minimum number of elements read before spill check" + Andrew Or + 2015-02-22 09:44:52 -0800 + Commit: 4186dd3 + + SPARK-5669 [BUILD] Reverse exclusion of JBLAS libs for 1.3 + Sean Owen + 2015-02-22 09:09:06 +0000 + Commit: eed7389, github.com/apache/spark/pull/4715 + + [DataFrame] [Typo] Fix the typo + Cheng Hao + 2015-02-22 08:56:30 +0000 + Commit: 04d3b32, github.com/apache/spark/pull/4717 + + [DOCS] Fix typo in API for custom InputFormats based on the “new” MapReduce API + Alexander + 2015-02-22 08:53:05 +0000 + Commit: c5a5c6f, github.com/apache/spark/pull/4718 + + [SPARK-5937][YARN] Fix ClientSuite to set YARN mode, so that the correct class is used in t... + Hari Shreedharan + 2015-02-21 10:01:01 -0800 + Commit: 76e3e65, github.com/apache/spark/pull/4711 + + SPARK-5841 [CORE] [HOTFIX 2] Memory leak in DiskBlockManager + Nishkam Ravi , nishkamravi2 , nravi + 2015-02-21 09:59:28 -0800 + Commit: 932338e, github.com/apache/spark/pull/4690 + + [SPARK-5909][SQL] Add a clearCache command to Spark SQL's cache manager + Yin Huai + 2015-02-20 16:20:02 +0800 + Commit: b9a6c5c, github.com/apache/spark/pull/4694 + + [SPARK-5898] [SPARK-5896] [SQL] [PySpark] create DataFrame from pandas and tuple/list + Davies Liu + 2015-02-20 15:35:05 -0800 + Commit: 913562a, github.com/apache/spark/pull/4679 + + [SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release + Joseph K. Bradley + 2015-02-20 02:31:32 -0800 + Commit: 8c12f31, github.com/apache/spark/pull/4675 + + [SPARK-4808] Removing minimum number of elements read before spill check + mcheah + 2015-02-19 18:09:22 -0800 + Commit: 0382dcc, github.com/apache/spark/pull/4420 + + [SPARK-5900][MLLIB] make PIC and FPGrowth Java-friendly + Xiangrui Meng + 2015-02-19 18:06:16 -0800 + Commit: ba941ce, github.com/apache/spark/pull/4695 + + SPARK-5570: No docs stating that `new SparkConf().set("spark.driver.memory", ...) will not work + Ilya Ganelin + 2015-02-19 15:50:58 -0800 + Commit: c5f3b9e, github.com/apache/spark/pull/4665 + + SPARK-4682 [CORE] Consolidate various 'Clock' classes + Sean Owen + 2015-02-19 15:35:23 -0800 + Commit: bd49e8b, github.com/apache/spark/pull/4514 + + [Spark-5889] Remove pid file after stopping service. + Zhan Zhang + 2015-02-19 23:13:02 +0000 + Commit: ff8976e, github.com/apache/spark/pull/4676 + + [SPARK-5902] [ml] Made PipelineStage.transformSchema public instead of private to ml + Joseph K. Bradley + 2015-02-19 12:46:27 -0800 + Commit: 0c494cf, github.com/apache/spark/pull/4682 + + [SPARK-5904][SQL] DataFrame API fixes. 
+ Reynold Xin + 2015-02-19 12:09:44 -0800 + Commit: 55d91d9, github.com/apache/spark/pull/4686 + + [SPARK-5825] [Spark Submit] Remove the double checking instance name when stopping the service + Cheng Hao + 2015-02-19 12:07:51 -0800 + Commit: fe00eb6, github.com/apache/spark/pull/4611 + + [SPARK-5423][Core] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file + zsxwing + 2015-02-19 18:37:31 +0000 + Commit: 25fae8e, github.com/apache/spark/pull/4219 + + [SPARK-5816] Add huge compatibility warning in DriverWrapper + Andrew Or + 2015-02-19 09:56:25 -0800 + Commit: f93d4d9, github.com/apache/spark/pull/4687 + + SPARK-5548: Fix for AkkaUtilsSuite failure - attempt 2 + Jacek Lewandowski + 2015-02-19 09:53:36 -0800 + Commit: fbcb949, github.com/apache/spark/pull/4653 + + [SPARK-5846] Correctly set job description and pool for SQL jobs + Kay Ousterhout + 2015-02-19 09:49:34 +0800 + Commit: 092b45f, github.com/apache/spark/pull/4630 + + [SPARK-5879][MLLIB] update PIC user guide and add a Java example + Xiangrui Meng + 2015-02-18 16:29:32 -0800 + Commit: a64f374, github.com/apache/spark/pull/4680 + + [SPARK-5722] [SQL] [PySpark] infer int as LongType + Davies Liu + 2015-02-18 14:17:04 -0800 + Commit: 470cba8, github.com/apache/spark/pull/4666 + + [SPARK-5840][SQL] HiveContext cannot be serialized due to tuple extraction + Reynold Xin + 2015-02-18 14:02:32 -0800 + Commit: b86e44c, github.com/apache/spark/pull/4628 + + [SPARK-5507] Added documentation for BlockMatrix + Burak Yavuz + 2015-02-18 10:11:08 -0800 + Commit: 56f8f29, github.com/apache/spark/pull/4664 + + [SPARK-5519][MLLIB] add user guide with example code for fp-growth + Xiangrui Meng + 2015-02-18 10:09:56 -0800 + Commit: 661fbd3, github.com/apache/spark/pull/4661 + + SPARK-5669 [BUILD] [HOTFIX] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS + Sean Owen + 2015-02-18 14:41:44 +0000 + Commit: 9f256ce, github.com/apache/spark/pull/4673 + + SPARK-4610 addendum: [Minor] [MLlib] Minor doc fix in GBT classification example + MechCoder + 2015-02-18 10:13:28 +0000 + Commit: 3997e74, github.com/apache/spark/pull/4672 + + [SPARK-5878] fix DataFrame.repartition() in Python + Davies Liu + 2015-02-18 01:00:54 -0800 + Commit: aca7991, github.com/apache/spark/pull/4667 + + Avoid deprecation warnings in JDBCSuite. + Tor Myklebust + 2015-02-18 01:00:13 -0800 + Commit: 9a565b8, github.com/apache/spark/pull/4668 + + [Minor] [SQL] Cleans up DataFrame variable names and toDF() calls + Cheng Lian + 2015-02-17 23:36:20 -0800 + Commit: 2bd33ce, github.com/apache/spark/pull/4670 + + [SPARK-5731][Streaming][Test] Fix incorrect test in DirectKafkaStreamSuite + Tathagata Das + 2015-02-17 22:44:16 -0800 + Commit: f8f9a64, github.com/apache/spark/pull/4597 + + [SPARK-5723][SQL]Change the default file format to Parquet for CTAS statements. 
+ Yin Huai + 2015-02-17 18:14:33 -0800 + Commit: 6e82c46, github.com/apache/spark/pull/4639 + + Preparing development version 1.3.1-SNAPSHOT + Patrick Wendell + 2015-02-18 01:52:06 +0000 + Commit: 2ab0ba0 + + Preparing Spark release v1.3.0-rc1 + Patrick Wendell + 2015-02-18 01:52:06 +0000 + Commit: f97b0d4 + + [SPARK-5875][SQL]logical.Project should not be resolved if it contains aggregates or generators + Yin Huai + 2015-02-17 17:50:39 -0800 + Commit: e8284b2, github.com/apache/spark/pull/4663 + + Revert "Preparing Spark release v1.3.0-snapshot1" + Patrick Wendell + 2015-02-17 17:48:47 -0800 + Commit: 7320605 + + Revert "Preparing development version 1.3.1-SNAPSHOT" + Patrick Wendell + 2015-02-17 17:48:43 -0800 + Commit: 932ae4d + + [SPARK-4454] Revert getOrElse() cleanup in DAGScheduler.getCacheLocs() + Josh Rosen + 2015-02-17 17:45:16 -0800 + Commit: 7e5e4d8 + + [SPARK-4454] Properly synchronize accesses to DAGScheduler cacheLocs map + Josh Rosen + 2015-02-17 17:39:58 -0800 + Commit: 07a401a, github.com/apache/spark/pull/4660 + + [SPARK-5811] Added documentation for maven coordinates and added Spark Packages support + Burak Yavuz , Davies Liu + 2015-02-17 17:15:43 -0800 + Commit: cb90584, github.com/apache/spark/pull/4662 + + [SPARK-5785] [PySpark] narrow dependency for cogroup/join in PySpark + Davies Liu + 2015-02-17 16:54:57 -0800 + Commit: 8120235, github.com/apache/spark/pull/4629 + + [SPARK-5852][SQL]Fail to convert a newly created empty metastore parquet table to a data source parquet table. + Yin Huai , Cheng Hao + 2015-02-17 15:47:59 -0800 + Commit: 07d8ef9, github.com/apache/spark/pull/4655 + + [SPARK-5872] [SQL] create a sqlCtx in pyspark shell + Davies Liu + 2015-02-17 15:44:37 -0800 + Commit: 0dba382, github.com/apache/spark/pull/4659 + + [SPARK-5871] output explain in Python + Davies Liu + 2015-02-17 13:48:38 -0800 + Commit: cb06160, github.com/apache/spark/pull/4658 + + [SPARK-4172] [PySpark] Progress API in Python + Davies Liu + 2015-02-17 13:36:43 -0800 + Commit: 35e23ff, github.com/apache/spark/pull/3027 + + [SPARK-5868][SQL] Fix python UDFs in HiveContext and checks in SQLContext + Michael Armbrust + 2015-02-17 13:23:45 -0800 + Commit: e65dc1f, github.com/apache/spark/pull/4657 + + [SQL] [Minor] Update the HiveContext Unittest + Cheng Hao + 2015-02-17 12:25:35 -0800 + Commit: 0135651, github.com/apache/spark/pull/4584 + + [Minor][SQL] Use same function to check path parameter in JSONRelation + Liang-Chi Hsieh + 2015-02-17 12:24:13 -0800 + Commit: d74d5e8, github.com/apache/spark/pull/4649 + + [SPARK-5862][SQL] Only transformUp the given plan once in HiveMetastoreCatalog + Liang-Chi Hsieh + 2015-02-17 12:23:18 -0800 + Commit: 62063b7, github.com/apache/spark/pull/4651 + + [Minor] fix typo in SQL document + CodingCat + 2015-02-17 12:16:52 -0800 + Commit: 5636c4a, github.com/apache/spark/pull/4656 + + [SPARK-5864] [PySpark] support .jar as python package + Davies Liu + 2015-02-17 12:05:06 -0800 + Commit: 71cf6e2, github.com/apache/spark/pull/4652 + + SPARK-5841 [CORE] [HOTFIX] Memory leak in DiskBlockManager + Sean Owen + 2015-02-17 19:40:06 +0000 + Commit: e64afcd, github.com/apache/spark/pull/4648 + + [SPARK-5661]function hasShutdownDeleteTachyonDir should use shutdownDeleteTachyonPaths to determine whether contains file + xukun 00228947 , viper-kun 
+ 2015-02-17 18:59:41 +0000 + Commit: 420bc9b, github.com/apache/spark/pull/4418 + + [SPARK-5778] throw if nonexistent metrics config file provided + Ryan Williams + 2015-02-17 10:57:16 -0800 + Commit: 2bf2b56, github.com/apache/spark/pull/4571 + + [SPARK-5859] [PySpark] [SQL] fix DataFrame Python API + Davies Liu + 2015-02-17 10:22:48 -0800 + Commit: 4a581aa, github.com/apache/spark/pull/4645 + + [SPARK-5166][SPARK-5247][SPARK-5258][SQL] API Cleanup / Documentation + Michael Armbrust + 2015-02-17 10:21:17 -0800 + Commit: cd3d415, github.com/apache/spark/pull/4642 + + [SPARK-5858][MLLIB] Remove unnecessary first() call in GLM + Xiangrui Meng + 2015-02-17 10:17:45 -0800 + Commit: 97cb568, github.com/apache/spark/pull/4647 + + SPARK-5856: In Maven build script, launch Zinc with more memory + Patrick Wendell + 2015-02-17 10:10:01 -0800 + Commit: 8240629, github.com/apache/spark/pull/4643 + + Revert "[SPARK-5363] [PySpark] check ending mark in non-block way" + Josh Rosen + 2015-02-17 07:48:27 -0800 + Commit: aeb85cd + + [SPARK-5826][Streaming] Fix Configuration not serializable problem + jerryshao + 2015-02-17 10:45:18 +0000 + Commit: b8da5c3, github.com/apache/spark/pull/4612 + + HOTFIX: Style issue causing build break + Patrick Wendell + 2015-02-16 22:10:39 -0800 + Commit: e9241fa + + [SPARK-5802][MLLIB] cache transformed data in glm + Xiangrui Meng + 2015-02-16 22:09:04 -0800 + Commit: dfe0fa0, github.com/apache/spark/pull/4593 + + [SPARK-5853][SQL] Schema support in Row. + Reynold Xin + 2015-02-16 20:42:57 -0800 + Commit: d0701d9, github.com/apache/spark/pull/4640 + + SPARK-5850: Remove experimental label for Scala 2.11 and FlumePollingStream + Patrick Wendell + 2015-02-16 20:33:33 -0800 + Commit: c6a7069, github.com/apache/spark/pull/4638 + + [SPARK-5363] [PySpark] check ending mark in non-block way + Davies Liu + 2015-02-16 20:32:03 -0800 + Commit: baad6b3, github.com/apache/spark/pull/4601 + + [SQL] Various DataFrame doc changes. + Reynold Xin + 2015-02-16 19:00:30 -0800 + Commit: e355b54, github.com/apache/spark/pull/4636 + + [SPARK-5849] Handle more types of invalid JSON requests in SubmitRestProtocolMessage.parseAction + Josh Rosen + 2015-02-16 18:08:02 -0800 + Commit: 385a339, github.com/apache/spark/pull/4637 + + [SPARK-3340] Deprecate ADD_JARS and ADD_FILES + azagrebin + 2015-02-16 18:06:19 -0800 + Commit: d8c70fb, github.com/apache/spark/pull/4616 + + [SPARK-5788] [PySpark] capture the exception in python write thread + Davies Liu + 2015-02-16 17:57:14 -0800 + Commit: c2a9a61, github.com/apache/spark/pull/4577 + + SPARK-5848: tear down the ConsoleProgressBar timer + Matt Whelan + 2015-02-17 00:59:49 +0000 + Commit: 52994d8, github.com/apache/spark/pull/4635 + + [SPARK-4865][SQL]Include temporary tables in SHOW TABLES + Yin Huai + 2015-02-16 15:59:23 -0800 + Commit: 8a94bf7, github.com/apache/spark/pull/4618 + + [SQL] Optimize arithmetic and predicate operators + kai + 2015-02-16 15:58:05 -0800 + Commit: 639a3c2, github.com/apache/spark/pull/4472 + + [SPARK-5839][SQL]HiveMetastoreCatalog does not recognize table names and aliases of data source tables. 
+ Yin Huai + 2015-02-16 15:54:01 -0800 + Commit: a15a0a0, github.com/apache/spark/pull/4626 + + [SPARK-5746][SQL] Check invalid cases for the write path of data source API + Yin Huai + 2015-02-16 15:51:59 -0800 + Commit: 4198654, github.com/apache/spark/pull/4617 + + HOTFIX: Break in Jekyll build from #4589 + Patrick Wendell + 2015-02-16 15:43:56 -0800 + Commit: ad8fd4f + + [SPARK-2313] Use socket to communicate GatewayServer port back to Python driver + Josh Rosen + 2015-02-16 15:25:11 -0800 + Commit: b70b8ba, github.com/apache/spark/pull/3424. + + SPARK-5357: Update commons-codec version to 1.10 (current) + Matt Whelan + 2015-02-16 23:05:34 +0000 + Commit: 8c45619, github.com/apache/spark/pull/4153 + + SPARK-5841: remove DiskBlockManager shutdown hook on stop + Matt Whelan + 2015-02-16 22:54:32 +0000 + Commit: dd977df, github.com/apache/spark/pull/4627 + + [SPARK-5833] [SQL] Adds REFRESH TABLE command + Cheng Lian + 2015-02-16 12:52:05 -0800 + Commit: 864d77e, github.com/apache/spark/pull/4624 + + [SPARK-5296] [SQL] Add more filter types for data sources API + Cheng Lian + 2015-02-16 12:48:55 -0800 + Commit: 363a9a7, github.com/apache/spark/pull/4623 + + [SQL] Add fetched row count in SparkSQLCLIDriver + OopsOutOfMemory + 2015-02-16 12:34:09 -0800 + Commit: 0368494, github.com/apache/spark/pull/4604 + + [SQL] Initial support for reporting location of error in sql string + Michael Armbrust + 2015-02-16 12:32:56 -0800 + Commit: 63fa123, github.com/apache/spark/pull/4587 + + [SPARK-5824] [SQL] add null format in ctas and set default col comment to null + Daoyuan Wang + 2015-02-16 12:31:36 -0800 + Commit: c2eaaea, github.com/apache/spark/pull/4609 + + [SQL] [Minor] Update the SpecificMutableRow.copy + Cheng Hao + 2015-02-16 12:21:08 -0800 + Commit: 1a88955, github.com/apache/spark/pull/4619 + + SPARK-5795 [STREAMING] api.java.JavaPairDStream.saveAsNewAPIHadoopFiles may not friendly to java + Sean Owen + 2015-02-16 19:32:31 +0000 + Commit: fef2267, github.com/apache/spark/pull/4608 + + [SPARK-5799][SQL] Compute aggregation function on specified numeric columns + Liang-Chi Hsieh + 2015-02-16 10:06:11 -0800 + Commit: 0165e9d, github.com/apache/spark/pull/4592 + + [SPARK-4553] [SPARK-5767] [SQL] Wires Parquet data source with the newly introduced write support for data source API + Cheng Lian + 2015-02-16 01:38:31 -0800 + Commit: 78f7edb, github.com/apache/spark/pull/4563 + + [Minor] [SQL] Renames stringRddToDataFrame to stringRddToDataFrameHolder for consistency + Cheng Lian + 2015-02-16 01:33:37 -0800 + Commit: 066301c, github.com/apache/spark/pull/4613 + + [Ml] SPARK-5804 Explicitly manage cache in Crossvalidator k-fold loop + Peter Rudenko + 2015-02-16 00:07:23 -0800 + Commit: 0d93205, github.com/apache/spark/pull/4595 + + [Ml] SPARK-5796 Don't transform data on a last estimator in Pipeline + Peter Rudenko + 2015-02-15 20:51:32 -0800 + Commit: 9cf7d70, github.com/apache/spark/pull/4590 + + SPARK-5815 [MLLIB] Deprecate SVDPlusPlus APIs that expose DoubleMatrix from JBLAS + Sean Owen + 2015-02-15 20:41:27 -0800 + Commit: db3c539, github.com/apache/spark/pull/4614 + + [SPARK-5769] Set params in constructors and in setParams in Python ML pipelines + Xiangrui Meng + 2015-02-15 20:29:26 -0800 + Commit: d710991, 
github.com/apache/spark/pull/4564 + + SPARK-5669 [BUILD] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS + Sean Owen + 2015-02-15 09:15:48 -0800 + Commit: 4e099d7, github.com/apache/spark/pull/4453 + + [MLLIB][SPARK-5502] User guide for isotonic regression + martinzapletal + 2015-02-15 09:10:03 -0800 + Commit: d96e188, github.com/apache/spark/pull/4536 + + [HOTFIX] Ignore DirectKafkaStreamSuite. + Patrick Wendell + 2015-02-13 12:43:53 -0800 + Commit: 70ebad4 + + [SPARK-5827][SQL] Add missing import in the example of SqlContext + Takeshi Yamamuro + 2015-02-15 14:42:20 +0000 + Commit: 9c1c70d, github.com/apache/spark/pull/4615 + + SPARK-5822 [BUILD] cannot import src/main/scala & src/test/scala into eclipse as source folder + gli + 2015-02-14 20:43:27 +0000 + Commit: f87f3b7, github.com/apache/spark/pull/4531 + + Revise formatting of previous commit f80e2629bb74bc62960c61ff313f7e7802d61319 + Sean Owen + 2015-02-14 20:12:29 +0000 + Commit: 1945fcf + + [SPARK-5800] Streaming Docs. Change linked files according the selected language + gasparms + 2015-02-14 20:10:29 +0000 + Commit: e99e170, github.com/apache/spark/pull/4589 + + [SPARK-5752][SQL] Don't implicitly convert RDDs directly to DataFrames + Reynold Xin , Davies Liu + 2015-02-13 23:03:22 -0800 + Commit: ba91bf5, github.com/apache/spark/pull/4556 + + SPARK-3290 [GRAPHX] No unpersist callls in SVDPlusPlus + Sean Owen + 2015-02-13 20:12:52 -0800 + Commit: db57479, github.com/apache/spark/pull/4234 + + [SPARK-5227] [SPARK-5679] Disable FileSystem cache in WholeTextFileRecordReaderSuite + Josh Rosen + 2015-02-13 17:45:31 -0800 + Commit: 152147f, github.com/apache/spark/pull/4599 + + [SPARK-5730][ML] add doc groups to spark.ml components + Xiangrui Meng + 2015-02-13 16:45:59 -0800 + Commit: fccd38d, github.com/apache/spark/pull/4600 + + [SPARK-5803][MLLIB] use ArrayBuilder to build primitive arrays + Xiangrui Meng + 2015-02-13 16:43:49 -0800 + Commit: 356b798, github.com/apache/spark/pull/4594 + + [SPARK-5806] re-organize sections in mllib-clustering.md + Xiangrui Meng + 2015-02-13 15:09:27 -0800 + Commit: 9658763, github.com/apache/spark/pull/4598 + + [SPARK-5789][SQL]Throw a better error message if JsonRDD.parseJson encounters unrecoverable parsing errors. + Yin Huai + 2015-02-13 13:51:06 -0800 + Commit: d9d0250, github.com/apache/spark/pull/4582 + + [SPARK-5642] [SQL] Apply column pruning on unused aggregation fields + Daoyuan Wang , Michael Armbrust + 2015-02-13 13:46:50 -0800 + Commit: efffc2e, github.com/apache/spark/pull/4415 + + [HOTFIX] Fix build break in MesosSchedulerBackendSuite + Andrew Or + 2015-02-13 13:10:29 -0800 + Commit: 4160371 + + SPARK-5805 Fixed the type error in documentation. 
+ Emre Sevinç + 2015-02-13 12:31:27 -0800 + Commit: ad73189, github.com/apache/spark/pull/4596 + + [SPARK-5735] Replace uses of EasyMock with Mockito + Josh Rosen + 2015-02-13 09:53:57 -0800 + Commit: cc9eec1, github.com/apache/spark/pull/4578 + + [SPARK-5783] Better eventlog-parsing error messages + Ryan Williams + 2015-02-13 09:47:26 -0800 + Commit: e5690a5, github.com/apache/spark/pull/4573 + + [SPARK-5503][MLLIB] Example code for Power Iteration Clustering + sboeschhuawei + 2015-02-13 09:45:57 -0800 + Commit: 5e63942, github.com/apache/spark/pull/4495 + + [SPARK-5732][CORE]:Add an option to print the spark version in spark script. + uncleGen , genmao.ygm + 2015-02-13 09:43:10 -0800 + Commit: 5c883df, github.com/apache/spark/pull/4522 + + [SPARK-4832][Deploy]some other processes might take the daemon pid + WangTaoTheTonic , WangTaoTheTonic + 2015-02-13 10:27:23 +0000 + Commit: 1255e83, github.com/apache/spark/pull/3683 + + [SQL] Fix docs of SQLContext.tables + Yin Huai + 2015-02-12 20:37:55 -0800 + Commit: a8f560c, github.com/apache/spark/pull/4579 + + [SPARK-3365][SQL]Wrong schema generated for List type + tianyi + 2015-02-12 22:18:39 -0800 + Commit: b9f332a, github.com/apache/spark/pull/4581 + + [SPARK-3299][SQL]Public API in SQLContext to list tables + Yin Huai + 2015-02-12 18:08:01 -0800 + Commit: edbac17, github.com/apache/spark/pull/4547 + + [SQL] Move SaveMode to SQL package. + Yin Huai + 2015-02-12 15:32:17 -0800 + Commit: 925fd84, github.com/apache/spark/pull/4542 + + [SPARK-5335] Fix deletion of security groups within a VPC + Vladimir Grigor , Vladimir Grigor + 2015-02-12 23:26:24 +0000 + Commit: 5c9db4e, github.com/apache/spark/pull/4122 + + [SPARK-5755] [SQL] remove unnecessary Add + Daoyuan Wang + 2015-02-12 15:22:07 -0800 + Commit: f7103b3, github.com/apache/spark/pull/4551 + + [SPARK-5573][SQL] Add explode to dataframes + Michael Armbrust + 2015-02-12 15:19:19 -0800 + Commit: c7eb9ee, github.com/apache/spark/pull/4546 + + [SPARK-5758][SQL] Use LongType as the default type for integers in JSON schema inference. + Yin Huai + 2015-02-12 15:17:25 -0800 + Commit: b0c79da, github.com/apache/spark/pull/4544 + + [SPARK-5780] [PySpark] Mute the logging during unit tests + Davies Liu + 2015-02-12 14:54:38 -0800 + Commit: bf0d15c, github.com/apache/spark/pull/4572 + + SPARK-5747: Fix wordsplitting bugs in make-distribution.sh + David Y. Ross + 2015-02-12 14:52:38 -0800 + Commit: 11a0d5b, github.com/apache/spark/pull/4540 + + [SPARK-5759][Yarn]ExecutorRunnable should catch YarnException while NMClient start contain... 
+ lianhuiwang + 2015-02-12 14:50:16 -0800 + Commit: 02d5b32, github.com/apache/spark/pull/4554 + + [SPARK-5760][SPARK-5761] Fix standalone rest protocol corner cases + revamp tests + Andrew Or + 2015-02-12 14:47:52 -0800 + Commit: 11d1080, github.com/apache/spark/pull/4557 + + [SPARK-5762] Fix shuffle write time for sort-based shuffle + Kay Ousterhout + 2015-02-12 14:46:37 -0800 + Commit: 0040fc5, github.com/apache/spark/pull/4559 + + [SPARK-5765][Examples]Fixed word split problem in run-example and compute-classpath + Venkata Ramana G , Venkata Ramana Gollamudi + 2015-02-12 14:44:21 -0800 + Commit: 9a1de4b, github.com/apache/spark/pull/4561 + + [SPARK-5645] Added local read bytes/time to task metrics + Kay Ousterhout + 2015-02-12 14:35:44 -0800 + Commit: 74f34bb, github.com/apache/spark/pull/4510 + + [SQL] Improve error messages + Michael Armbrust , wangfei + 2015-02-12 13:11:28 -0800 + Commit: e3a975d, github.com/apache/spark/pull/4558 + + [SQL][DOCS] Update sql documentation + Antonio Navarro Perez + 2015-02-12 12:46:17 -0800 + Commit: cbd659e, github.com/apache/spark/pull/4560 + + [SPARK-5757][MLLIB] replace SQL JSON usage in model import/export by json4s + Xiangrui Meng + 2015-02-12 10:48:13 -0800 + Commit: e26c149, github.com/apache/spark/pull/4555 + + [SPARK-5655] Don't chmod700 application files if running in YARN + Andrew Rowson + 2015-02-12 18:41:39 +0000 + Commit: e23c8f5, github.com/apache/spark/pull/4509 + + [SQL] Make dataframe more tolerant of being serialized + Michael Armbrust + 2015-02-11 19:05:49 -0800 + Commit: 3c1b9bf, github.com/apache/spark/pull/4545 + + [SQL] Two DataFrame fixes. + Reynold Xin + 2015-02-11 18:32:48 -0800 + Commit: bcb1382, github.com/apache/spark/pull/4543 + + [SPARK-3688][SQL] More inline comments for LogicalPlan. + Reynold Xin + 2015-02-11 15:26:31 -0800 + Commit: 08ab3d2, github.com/apache/spark/pull/4539 + + [SPARK-3688][SQL]LogicalPlan can't resolve column correctlly + tianyi + 2015-02-11 12:50:17 -0800 + Commit: e136f47, github.com/apache/spark/pull/4524 + + [SPARK-5454] More robust handling of self joins + Michael Armbrust + 2015-02-11 12:31:56 -0800 + Commit: 1bb3631, github.com/apache/spark/pull/4520 + + Remove outdated remark about take(n). 
+ Daniel Darabos + 2015-02-11 20:24:17 +0000 + Commit: 72adfc5, github.com/apache/spark/pull/4533 + + [SPARK-5677] [SPARK-5734] [SQL] [PySpark] Python DataFrame API remaining tasks + Davies Liu + 2015-02-11 12:13:16 -0800 + Commit: d66aae2, github.com/apache/spark/pull/4528 + + [SPARK-5733] Error Link in Pagination of HistroyPage when showing Incomplete Applications + guliangliang + 2015-02-11 15:55:49 +0000 + Commit: 864dccd, github.com/apache/spark/pull/4523 + + SPARK-5727 [BUILD] Deprecate Debian packaging + Sean Owen + 2015-02-11 08:30:16 +0000 + Commit: 057ec4f, github.com/apache/spark/pull/4516 + + SPARK-5728 [STREAMING] MQTTStreamSuite leaves behind ActiveMQ database files + Sean Owen + 2015-02-11 08:13:51 +0000 + Commit: 476b6d7, github.com/apache/spark/pull/4517 + + [SPARK-4964] [Streaming] refactor createRDD to take leaders via map instead of array + cody koeninger + 2015-02-11 00:13:27 -0800 + Commit: 811d179, github.com/apache/spark/pull/4511 + + Preparing development version 1.3.1-SNAPSHOT + Patrick Wendell + 2015-02-11 07:47:03 +0000 + Commit: e57c81b + + Preparing Spark release v1.3.0-snapshot1 + Patrick Wendell + 2015-02-11 07:47:02 +0000 + Commit: d97bfc6 + + Revert "Preparing Spark release v1.3.0-snapshot1" + Patrick Wendell + 2015-02-10 23:46:04 -0800 + Commit: 6a91d59 + + Revert "Preparing development version 1.3.1-SNAPSHOT" + Patrick Wendell + 2015-02-10 23:46:02 -0800 + Commit: 3a50383 + + HOTFIX: Adding Junit to Hive tests for Maven build + Patrick Wendell + 2015-02-10 23:39:21 -0800 + Commit: 0386fc4 + + Preparing development version 1.3.1-SNAPSHOT + Patrick Wendell + 2015-02-11 06:45:03 +0000 + Commit: ba12b79 + + Preparing Spark release v1.3.0-snapshot1 + Patrick Wendell + 2015-02-11 06:45:03 +0000 + Commit: 53068f5 + + HOTFIX: Java 6 compilation error in Spark SQL + Patrick Wendell + 2015-02-10 22:43:32 -0800 + Commit: 15180bc + + Revert "Preparing Spark release v1.3.0-snapshot1" + Patrick Wendell + 2015-02-10 22:44:10 -0800 + Commit: 536dae9 + + Revert "Preparing development version 1.3.1-SNAPSHOT" + Patrick Wendell + 2015-02-10 22:44:07 -0800 + Commit: 01b562e + + Preparing development version 1.3.1-SNAPSHOT + Patrick Wendell + 2015-02-11 06:15:29 +0000 + Commit: db80d0f + + Preparing Spark release v1.3.0-snapshot1 + Patrick Wendell + 2015-02-11 06:15:29 +0000 + Commit: c2e4001 + + Updating versions for Spark 1.3 + Patrick Wendell + 2015-02-10 21:54:55 -0800 + Commit: 2f52489 + + [SPARK-5714][Mllib] Refactor initial step of LDA to remove redundant operations + Liang-Chi Hsieh + 2015-02-10 21:51:15 -0800 + Commit: ba3aa8f, github.com/apache/spark/pull/4501 + + [SPARK-5702][SQL] Allow short names for built-in data sources. + Reynold Xin + 2015-02-10 20:40:21 -0800 + Commit: 63af90c, github.com/apache/spark/pull/4489 + + [SPARK-5729] Potential NPE in standalone REST API + Andrew Or + 2015-02-10 20:19:14 -0800 + Commit: 1bc75b0, github.com/apache/spark/pull/4518 + + [SPARK-4879] Use driver to coordinate Hadoop output committing for speculative tasks + mcheah , Josh Rosen + 2015-02-10 20:12:18 -0800 + Commit: 79cd59c, github.com/apache/spark/pull/4155. + + [SQL][DataFrame] Fix column computability bug. 
+ Reynold Xin + 2015-02-10 19:50:44 -0800 + Commit: e477e91, github.com/apache/spark/pull/4519 + + [SPARK-5709] [SQL] Add EXPLAIN support in DataFrame API for debugging purpose + Cheng Hao + 2015-02-10 19:40:51 -0800 + Commit: 7fa0d5f, github.com/apache/spark/pull/4496 + + [SPARK-5704] [SQL] [PySpark] createDataFrame from RDD with columns + Davies Liu + 2015-02-10 19:40:12 -0800 + Commit: 1056c5b, github.com/apache/spark/pull/4498 + + [SPARK-5683] [SQL] Avoid multiple json generator created + Cheng Hao + 2015-02-10 18:19:56 -0800 + Commit: fc0446f, github.com/apache/spark/pull/4468 + + [SQL] Add an exception for analysis errors. + Michael Armbrust + 2015-02-10 17:32:42 -0800 + Commit: 748cdc1, github.com/apache/spark/pull/4439 + + [SPARK-5658][SQL] Finalize DDL and write support APIs + Yin Huai + 2015-02-10 17:29:52 -0800 + Commit: a21090e, github.com/apache/spark/pull/4446 + + [SPARK-5493] [core] Add option to impersonate user. + Marcelo Vanzin + 2015-02-10 17:19:10 -0800 + Commit: 8e75b0e, github.com/apache/spark/pull/4405 + + [SQL] Make Options in the data source API CREATE TABLE statements optional. + Yin Huai + 2015-02-10 17:06:12 -0800 + Commit: 445dbc7, github.com/apache/spark/pull/4515 + + [SPARK-5725] [SQL] Fixes ParquetRelation2.equals + Cheng Lian + 2015-02-10 17:02:44 -0800 + Commit: f43bc3d, github.com/apache/spark/pull/4513 + + [SPARK-5343][GraphX]: ShortestPaths traverses backwards + Brennon York + 2015-02-10 14:57:00 -0800 + Commit: 5be8902, github.com/apache/spark/pull/4478 + + [SPARK-5021] [MLlib] Gaussian Mixture now supports Sparse Input + MechCoder + 2015-02-10 14:05:55 -0800 + Commit: bba0953, github.com/apache/spark/pull/4459 + + [HOTFIX][SPARK-4136] Fix compilation and tests + Andrew Or + 2015-02-10 11:18:01 -0800 + Commit: 4e3aa68 + + [SPARK-5686][SQL] Add show current roles command in HiveQl + OopsOutOfMemory + 2015-02-10 13:20:15 -0800 + Commit: 8b7587a, github.com/apache/spark/pull/4471 + + [SQL] Add toString to DataFrame/Column + Michael Armbrust + 2015-02-10 13:14:01 -0800 + Commit: ef739d9, github.com/apache/spark/pull/4436 + + SPARK-5613: Catch the ApplicationNotFoundException exception to avoid thread from getting killed on yarn restart. + Kashish Jain + 2015-02-06 13:47:23 -0800 + Commit: c294216, github.com/apache/spark/pull/4392 + + [SPARK-5592][SQL] java.net.URISyntaxException when insert data to a partitioned table + wangfei , Fei Wang + 2015-02-10 11:54:30 -0800 + Commit: dbfce30, github.com/apache/spark/pull/4368 + + SPARK-4136. 
Under dynamic allocation, cancel outstanding executor requests when no longer needed + Sandy Ryza + 2015-02-10 11:07:25 -0800 + Commit: e53da21, github.com/apache/spark/pull/4168 + + [SPARK-5716] [SQL] Support TOK_CHARSETLITERAL in HiveQl + Daoyuan Wang + 2015-02-10 11:08:21 -0800 + Commit: e508237, github.com/apache/spark/pull/4502 + + [Spark-5717] [MLlib] add stop and reorganize import + JqueryFan , Yuhao Yang + 2015-02-10 17:37:32 +0000 + Commit: b32f553, github.com/apache/spark/pull/4503 + + [SPARK-5700] [SQL] [Build] Bumps jets3t to 0.9.3 for hadoop-2.3 and hadoop-2.4 profiles + Cheng Lian + 2015-02-10 02:28:47 -0800 + Commit: d6f31e0, github.com/apache/spark/pull/4499 + + SPARK-5239 [CORE] JdbcRDD throws "java.lang.AbstractMethodError: oracle.jdbc.driver.xxxxxx.isClosed()Z" + Sean Owen + 2015-02-10 09:19:01 +0000 + Commit: 4cfc025, github.com/apache/spark/pull/4470 + + [SPARK-4964][Streaming][Kafka] More updates to Exactly-once Kafka stream + Tathagata Das + 2015-02-09 22:45:48 -0800 + Commit: 281614d, github.com/apache/spark/pull/4384 + + [SPARK-5597][MLLIB] save/load for decision trees and emsembles + Joseph K. Bradley , Xiangrui Meng + 2015-02-09 22:09:07 -0800 + Commit: 01905c4, github.com/apache/spark/pull/4444. + + [SQL] Remove the duplicated code + Cheng Hao + 2015-02-09 21:33:34 -0800 + Commit: 663d34e, github.com/apache/spark/pull/4494 + + [SPARK-5701] Only set ShuffleReadMetrics when task has shuffle deps + Kay Ousterhout + 2015-02-09 21:22:09 -0800 + Commit: 6ddbca4, github.com/apache/spark/pull/4488 + + [SPARK-5703] AllJobsPage throws empty.max exception + Andrew Or + 2015-02-09 21:18:48 -0800 + Commit: 8326255, github.com/apache/spark/pull/4490 + + [SPARK-2996] Implement userClassPathFirst for driver, yarn. + Marcelo Vanzin + 2015-02-09 21:17:06 -0800 + Commit: 6a1e0f9, github.com/apache/spark/pull/3233 + + SPARK-4900 [MLLIB] MLlib SingularValueDecomposition ARPACK IllegalStateException + Sean Owen + 2015-02-09 21:13:58 -0800 + Commit: ebf1df0, github.com/apache/spark/pull/4485 + + Add a config option to print DAG. + KaiXinXiaoLei + 2015-02-09 20:58:58 -0800 + Commit: dad05e0, github.com/apache/spark/pull/4257 + + [SPARK-5469] restructure pyspark.sql into multiple files + Davies Liu + 2015-02-09 20:49:22 -0800 + Commit: f0562b4, github.com/apache/spark/pull/4479 + + [SPARK-5698] Do not let user request negative # of executors + Andrew Or + 2015-02-09 17:33:29 -0800 + Commit: 62b1e1f, github.com/apache/spark/pull/4483 + + [SPARK-5699] [SQL] [Tests] Runs hive-thriftserver tests whenever SQL code is modified + Cheng Lian + 2015-02-09 16:52:05 -0800 + Commit: 71f0f51, github.com/apache/spark/pull/4486 + + [SPARK-5648][SQL] support "alter ... unset tblproperties("key")" + DoingDone9 <799203320@qq.com> + 2015-02-09 16:40:26 -0800 + Commit: e2bf59a, github.com/apache/spark/pull/4424 + + [SPARK-2096][SQL] support dot notation on array of struct + Wenchen Fan + 2015-02-09 16:39:34 -0800 + Commit: 15f557f, github.com/apache/spark/pull/2405 + + [SPARK-5614][SQL] Predicate pushdown through Generate. 
+ Lu Yan + 2015-02-09 16:25:38 -0800 + Commit: ce2c89c, github.com/apache/spark/pull/4394 + + [SPARK-5696] [SQL] [HOTFIX] Asks HiveThriftServer2 to re-initialize log4j using Hive configurations + Cheng Lian + 2015-02-09 16:23:12 -0800 + Commit: 379233c, github.com/apache/spark/pull/4484 + + [SQL] Code cleanup. + Yin Huai + 2015-02-09 16:20:42 -0800 + Commit: e241601, github.com/apache/spark/pull/4482 + + [SQL] Add some missing DataFrame functions. + Michael Armbrust + 2015-02-09 16:02:56 -0800 + Commit: a70dca0, github.com/apache/spark/pull/4437 + + [SPARK-5675][SQL] XyzType companion object should subclass XyzType + Reynold Xin + 2015-02-09 14:51:46 -0800 + Commit: 1e2fab2, github.com/apache/spark/pull/4463 + + [SPARK-4905][STREAMING] FlumeStreamSuite fix. + Hari Shreedharan + 2015-02-09 14:17:14 -0800 + Commit: 18c5a99, github.com/apache/spark/pull/4371 + + [SPARK-5691] Fixing wrong data structure lookup for dupe app registratio... + mcheah + 2015-02-09 13:20:14 -0800 + Commit: 6a0144c, github.com/apache/spark/pull/4477 + + [SPARK-5678] Convert DataFrame to pandas.DataFrame and Series + Davies Liu + 2015-02-09 11:42:52 -0800 + Commit: 43972b5, github.com/apache/spark/pull/4476 + + [SPARK-5664][BUILD] Restore stty settings when exiting from SBT's spark-shell + Liang-Chi Hsieh + 2015-02-09 11:45:12 -0800 + Commit: fa67877, github.com/apache/spark/pull/4451 + + SPARK-4267 [YARN] Failing to launch jobs on Spark on YARN with Hadoop 2.5.0 or later + Sean Owen + 2015-02-09 10:33:57 -0800 + Commit: c88d4ab, github.com/apache/spark/pull/4452 + + [SPARK-5473] [EC2] Expose SSH failures after status checks pass + Nicholas Chammas + 2015-02-09 09:44:53 +0000 + Commit: f2aa7b7, github.com/apache/spark/pull/4262 + + [SPARK-5539][MLLIB] LDA guide + Xiangrui Meng , Joseph K. Bradley + 2015-02-08 23:40:36 -0800 + Commit: 5782ee2, github.com/apache/spark/pull/4465 + + [SPARK-5472][SQL] Fix Scala code style + Hung Lin + 2015-02-08 22:36:42 -0800 + Commit: 955f286, github.com/apache/spark/pull/4464 + + SPARK-4405 [MLLIB] Matrices.* construction methods should check for rows x cols overflow + Sean Owen + 2015-02-08 21:08:50 -0800 + Commit: fa8ea48, github.com/apache/spark/pull/4461 + + [SPARK-5660][MLLIB] Make Matrix apply public + Joseph K. Bradley , Xiangrui Meng + 2015-02-08 21:07:36 -0800 + Commit: df9b105, github.com/apache/spark/pull/4447 + + [SPARK-5643][SQL] Add a show method to print the content of a DataFrame in tabular format. + Reynold Xin + 2015-02-08 18:56:51 -0800 + Commit: e1996aa, github.com/apache/spark/pull/4416 + + SPARK-5665 [DOCS] Update netlib-java documentation + Sam Halliday , Sam Halliday + 2015-02-08 16:34:26 -0800 + Commit: c515634, github.com/apache/spark/pull/4448 + + [SPARK-5598][MLLIB] model save/load for ALS + Xiangrui Meng + 2015-02-08 16:26:20 -0800 + Commit: 9e4d58f, github.com/apache/spark/pull/4422 + + [SQL] Set sessionState in QueryExecution. + Yin Huai + 2015-02-08 14:55:07 -0800 + Commit: 42c56b6, github.com/apache/spark/pull/4445 + + [SPARK-3039] [BUILD] Spark assembly for new hadoop API (hadoop 2) contai... + medale + 2015-02-08 10:35:29 +0000 + Commit: bc55e20, github.com/apache/spark/pull/4315 + + [SPARK-5672][Web UI] Don't return `ERROR 500` when have missing args + Kirill A. 
Korinskiy + 2015-02-08 10:31:46 +0000 + Commit: 96010fa, github.com/apache/spark/pull/4239 + + [SPARK-5671] Upgrade jets3t to 0.9.2 in hadoop-2.3 and 2.4 profiles + Josh Rosen + 2015-02-07 17:19:08 -0800 + Commit: 0f9d765, github.com/apache/spark/pull/4454 + + [SPARK-5108][BUILD] Jackson dependency management for Hadoop-2.6.0 support + Zhan Zhang + 2015-02-07 19:41:30 +0000 + Commit: 51fbca4, github.com/apache/spark/pull/3938 + + [BUILD] Add the ability to launch spark-shell from SBT. + Michael Armbrust + 2015-02-07 00:14:38 -0800 + Commit: 6bda169, github.com/apache/spark/pull/4438 + + [SPARK-5388] Provide a stable application submission gateway for standalone cluster mode + Andrew Or + 2015-02-06 15:57:06 -0800 + Commit: 6ec0cdc, github.com/apache/spark/pull/4216 + + SPARK-5403: Ignore UserKnownHostsFile in SSH calls + Grzegorz Dubicki + 2015-02-06 15:43:58 -0800 + Commit: 3d99741, github.com/apache/spark/pull/4196 + + [SPARK-5601][MLLIB] make streaming linear algorithms Java-friendly + Xiangrui Meng + 2015-02-06 15:42:59 -0800 + Commit: 11b28b9, github.com/apache/spark/pull/4432 + + [SQL] [Minor] HiveParquetSuite was disabled by mistake, re-enable them + Cheng Lian + 2015-02-06 15:23:42 -0800 + Commit: 4005802, github.com/apache/spark/pull/4440 + + [SQL] Use TestSQLContext in Java tests + Michael Armbrust + 2015-02-06 15:11:02 -0800 + Commit: c950058, github.com/apache/spark/pull/4441 + + [SPARK-4994][network]Cleanup removed executors' ShuffleInfo in yarn shuffle service + lianhuiwang + 2015-02-06 14:47:52 -0800 + Commit: af6ddf8, github.com/apache/spark/pull/3828 + + [SPARK-5444][Network]Add a retry to deal with the conflict port in netty server. + huangzhaowei + 2015-02-06 14:35:29 -0800 + Commit: caca15a, github.com/apache/spark/pull/4240 + + [SPARK-4874] [CORE] Collect record count metrics + Kostas Sakellis + 2015-02-06 14:31:20 -0800 + Commit: 9fa29a6, github.com/apache/spark/pull/4067 + + [HOTFIX] Fix the maven build after adding sqlContext to spark-shell + Michael Armbrust + 2015-02-06 14:27:06 -0800 + Commit: 11dbf71, github.com/apache/spark/pull/4443 + + [SPARK-5600] [core] Clean up FsHistoryProvider test, fix app sort order. 
+ Marcelo Vanzin + 2015-02-06 14:23:09 -0800 + Commit: 09feecc, github.com/apache/spark/pull/4370 + + SPARK-5633 pyspark saveAsTextFile support for compression codec + Vladimir Vladimirov + 2015-02-06 13:55:02 -0800 + Commit: 1d32341, github.com/apache/spark/pull/4403 + + [HOTFIX][MLLIB] fix a compilation error with java 6 + Xiangrui Meng + 2015-02-06 13:52:35 -0800 + Commit: 87e0f0d, github.com/apache/spark/pull/4442 + + [SPARK-4983] Insert waiting time before tagging EC2 instances + GenTang , Gen TANG + 2015-02-06 13:27:34 -0800 + Commit: 2872d83, github.com/apache/spark/pull/3986 + + [SPARK-5586][Spark Shell][SQL] Make `sqlContext` available in spark shell + OopsOutOfMemory + 2015-02-06 13:20:10 -0800 + Commit: 2ef9853, github.com/apache/spark/pull/4387 + + [SPARK-5278][SQL] Introduce UnresolvedGetField and complete the check of ambiguous reference to fields + Wenchen Fan + 2015-02-06 13:08:09 -0800 + Commit: 1b148ad, github.com/apache/spark/pull/4068 + + [SQL][Minor] Remove cache keyword in SqlParser + wangfei + 2015-02-06 12:42:23 -0800 + Commit: d822606, github.com/apache/spark/pull/4393 + + [SQL][HiveConsole][DOC] HiveConsole `correct hiveconsole imports` + OopsOutOfMemory + 2015-02-06 12:41:28 -0800 + Commit: 2abaa6e, github.com/apache/spark/pull/4389 + + [SPARK-5595][SPARK-5603][SQL] Add a rule to do PreInsert type casting and field renaming and invalidating in memory cache after INSERT + Yin Huai + 2015-02-06 12:38:07 -0800 + Commit: 3c34d62, github.com/apache/spark/pull/4373 + + [SPARK-5324][SQL] Results of describe can't be queried + OopsOutOfMemory , Sheng, Li + 2015-02-06 12:33:20 -0800 + Commit: 0fc35da, github.com/apache/spark/pull/4249 + + [SPARK-5619][SQL] Support 'show roles' in HiveContext + q00251598 + 2015-02-06 12:29:26 -0800 + Commit: cc66a3c, github.com/apache/spark/pull/4397 + + [SPARK-5640] Synchronize ScalaReflection where necessary + Tobias Schlatter + 2015-02-06 12:15:02 -0800 + Commit: 779e28b, github.com/apache/spark/pull/4431 + + [SPARK-5650][SQL] Support optional 'FROM' clause + Liang-Chi Hsieh + 2015-02-06 12:13:44 -0800 + Commit: 921121d, github.com/apache/spark/pull/4426 + + [SPARK-5628] Add version option to spark-ec2 + Nicholas Chammas + 2015-02-06 12:08:22 -0800 + Commit: ab0ffde, github.com/apache/spark/pull/4414 + + [SPARK-2945][YARN][Doc]add doc for spark.executor.instances + WangTaoTheTonic + 2015-02-06 11:57:02 -0800 + Commit: 540f474, github.com/apache/spark/pull/4350 + + [SPARK-4361][Doc] Add more docs for Hadoop Configuration + zsxwing + 2015-02-06 11:50:20 -0800 + Commit: 528dd34, github.com/apache/spark/pull/3225 + + [HOTFIX] Fix test build break in ExecutorAllocationManagerSuite. + Josh Rosen + 2015-02-06 11:47:32 -0800 + Commit: 9e828f4 + + [SPARK-5652][Mllib] Use broadcasted weights in LogisticRegressionModel + Liang-Chi Hsieh + 2015-02-06 11:22:11 -0800 + Commit: 6fda4c1, github.com/apache/spark/pull/4429 + + [SPARK-5555] Enable UISeleniumSuite tests + Josh Rosen + 2015-02-06 11:14:58 -0800 + Commit: 93fee7b, github.com/apache/spark/pull/4334 + + SPARK-2450 Adds executor log links to Web UI + Kostas Sakellis , Josh Rosen + 2015-02-06 11:13:00 -0800 + Commit: e74dd04, github.com/apache/spark/pull/3486 + + [SPARK-5618][Spark Core][Minor] Optimise utility code. 
+ Makoto Fukuhara + 2015-02-06 11:11:38 -0800 + Commit: 3feb798, github.com/apache/spark/pull/4396 + + [SPARK-5593][Core]Replace BlockManagerListener with ExecutorListener in ExecutorAllocationListener + lianhuiwang + 2015-02-06 11:09:37 -0800 + Commit: 9387dc1, github.com/apache/spark/pull/4369 + + [SPARK-4877] Allow user first classes to extend classes in the parent. + Stephen Haberman + 2015-02-06 11:03:56 -0800 + Commit: 52386cf, github.com/apache/spark/pull/3725 + + [SPARK-5396] Syntax error in spark scripts on windows. + Masayoshi TSUZUKI + 2015-02-06 10:58:26 -0800 + Commit: 2dc94cd, github.com/apache/spark/pull/4428 + + [SPARK-5636] Ramp up faster in dynamic allocation + Andrew Or + 2015-02-06 10:54:23 -0800 + Commit: 0a90305, github.com/apache/spark/pull/4409 + + SPARK-4337. [YARN] Add ability to cancel pending requests + Sandy Ryza + 2015-02-06 10:53:16 -0800 + Commit: 1568391, github.com/apache/spark/pull/4141 + + [SPARK-5416] init Executor.threadPool before ExecutorSource + Ryan Williams + 2015-02-06 12:22:25 +0000 + Commit: f9bc4ef, github.com/apache/spark/pull/4212 + + [Build] Set all Debian package permissions to 755 + Nicholas Chammas + 2015-02-06 11:38:39 +0000 + Commit: 3638216, github.com/apache/spark/pull/4277 + + Update ec2-scripts.md + Miguel Peralvo + 2015-02-06 11:04:48 +0000 + Commit: f6613fc, github.com/apache/spark/pull/4300 + + [SPARK-5470][Core]use defaultClassLoader to load classes in KryoSerializer + lianhuiwang + 2015-02-06 11:00:35 +0000 + Commit: 8007a4f, github.com/apache/spark/pull/4258 + + [SPARK-5653][YARN] In ApplicationMaster rename isDriver to isClusterMode + lianhuiwang + 2015-02-06 10:48:31 -0800 + Commit: 4ff8855, github.com/apache/spark/pull/4430 + + [SPARK-5582] [history] Ignore empty log directories. + Marcelo Vanzin + 2015-02-06 10:07:20 +0000 + Commit: faccdcb, github.com/apache/spark/pull/4352 + + [SPARK-5157][YARN] Configure more JVM options properly when we use ConcMarkSweepGC for AM. + Kousuke Saruta + 2015-02-06 09:39:12 +0000 + Commit: 25d8044, github.com/apache/spark/pull/3956 + + [Minor] Remove permission for execution from spark-shell.cmd + Kousuke Saruta + 2015-02-06 09:33:36 +0000 + Commit: 7c54681, github.com/apache/spark/pull/3983 + + [SPARK-5380][GraphX] Solve an ArrayIndexOutOfBoundsException when build graph with a file format error + Leolh + 2015-02-06 09:01:53 +0000 + Commit: ffdb2e9, github.com/apache/spark/pull/4176 + + [SPARK-5013] [MLlib] Added documentation and sample data file for GaussianMixture + Travis Galoppo + 2015-02-06 10:26:51 -0800 + Commit: f408db6, github.com/apache/spark/pull/4401 + + [SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs + Joseph K. Bradley + 2015-02-05 23:43:47 -0800 + Commit: 45b95e7, github.com/apache/spark/pull/3637 + + [SPARK-5604][MLLIB] remove checkpointDir from trees + Xiangrui Meng + 2015-02-05 23:32:09 -0800 + Commit: c35a11e, github.com/apache/spark/pull/4407 + + [SPARK-5639][SQL] Support DataFrame.renameColumn. + Reynold Xin + 2015-02-05 23:02:40 -0800 + Commit: 0639d3e, github.com/apache/spark/pull/4410 + + Revert "SPARK-5607: Update to Kryo 2.24.0 to avoid including objenesis 1.2." 
+ Patrick Wendell + 2015-02-05 18:37:55 -0800 + Commit: 6d31531 + + SPARK-5557: Explicitly include servlet API in dependencies. + Patrick Wendell + 2015-02-05 18:14:54 -0800 + Commit: 34131fd, github.com/apache/spark/pull/4411 + + [HOTFIX] [SQL] Disables Metastore Parquet table conversion for "SQLQuerySuite.CTAS with serde" + Cheng Lian + 2015-02-05 18:09:18 -0800 + Commit: ce6d8bb, github.com/apache/spark/pull/4413 + + [SPARK-5638][SQL] Add a config flag to disable eager analysis of DataFrames + Reynold Xin + 2015-02-05 18:07:10 -0800 + Commit: 4fd67e4, github.com/apache/spark/pull/4408 + + [SPARK-5620][DOC] group methods in generated unidoc + Xiangrui Meng + 2015-02-05 16:26:51 -0800 + Commit: e2be79d, github.com/apache/spark/pull/4404 + + [SPARK-5182] [SPARK-5528] [SPARK-5509] [SPARK-3575] [SQL] Parquet data source improvements + Cheng Lian + 2015-02-05 15:29:56 -0800 + Commit: 50c48eb, github.com/apache/spark/pull/4308 + + [SPARK-5604[MLLIB] remove checkpointDir from LDA + Xiangrui Meng + 2015-02-05 15:07:33 -0800 + Commit: 59798cb, github.com/apache/spark/pull/4390 + + [SPARK-5460][MLlib] Wrapped `Try` around `deleteAllCheckpoints` - RandomForest. + x1- + 2015-02-05 15:02:04 -0800 + Commit: 44768f5, github.com/apache/spark/pull/4347 + + [SPARK-5135][SQL] Add support for describe table to DDL in SQLContext + OopsOutOfMemory + 2015-02-05 13:07:48 -0800 + Commit: 55cebcf, github.com/apache/spark/pull/4227 + + [SPARK-5617][SQL] fix test failure of SQLQuerySuite + wangfei + 2015-02-05 12:44:12 -0800 + Commit: 785a2e3, github.com/apache/spark/pull/4395 + + [Branch-1.3] [DOC] doc fix for date + Daoyuan Wang + 2015-02-05 12:42:27 -0800 + Commit: 17ef7f9, github.com/apache/spark/pull/4400 + + [SPARK-5474][Build]curl should support URL redirection in build/mvn + GuoQiang Li + 2015-02-05 12:03:13 -0800 + Commit: d1066e9, github.com/apache/spark/pull/4263 + + [HOTFIX] MLlib build break. + Reynold Xin + 2015-02-05 00:42:50 -0800 + Commit: c83d118 + + SPARK-5548: Fixed a race condition in AkkaUtilsSuite + Jacek Lewandowski + 2015-02-05 12:00:04 -0800 + Commit: fba2dc6, github.com/apache/spark/pull/4343 + + [SPARK-5608] Improve SEO of Spark documentation pages + Matei Zaharia + 2015-02-05 11:12:50 -0800 + Commit: de112a2, github.com/apache/spark/pull/4381 + + SPARK-4687. Add a recursive option to the addFile API + Sandy Ryza + 2015-02-05 10:15:55 -0800 + Commit: c22ccc0, github.com/apache/spark/pull/3670 + + [MLlib] Minor: UDF style update. + Reynold Xin + 2015-02-04 23:57:53 -0800 + Commit: 4074674, github.com/apache/spark/pull/4388 + + [SPARK-5612][SQL] Move DataFrame implicit functions into SQLContext.implicits. + Reynold Xin + 2015-02-04 23:44:34 -0800 + Commit: 0040b61, github.com/apache/spark/pull/4386 + + [SPARK-5606][SQL] Support plus sign in HiveContext + q00251598 + 2015-02-04 23:16:01 -0800 + Commit: bf43781, github.com/apache/spark/pull/4378 + + [SPARK-5599] Check MLlib public APIs for 1.3 + Xiangrui Meng + 2015-02-04 23:03:47 -0800 + Commit: abc184e, github.com/apache/spark/pull/4377 + + [SPARK-5596] [mllib] ML model import/export for GLMs, NaiveBayes + Joseph K. Bradley + 2015-02-04 22:46:48 -0800 + Commit: 885bcbb, github.com/apache/spark/pull/4233 + + SPARK-5607: Update to Kryo 2.24.0 to avoid including objenesis 1.2. 
+ Patrick Wendell + 2015-02-04 22:39:44 -0800 + Commit: 59fb5c7, github.com/apache/spark/pull/4383 + + [SPARK-5602][SQL] Better support for creating DataFrame from local data collection + Reynold Xin + 2015-02-04 19:53:57 -0800 + Commit: b8f9c00, github.com/apache/spark/pull/4372 + + [SPARK-5538][SQL] Fix flaky CachedTableSuite + Reynold Xin + 2015-02-04 19:52:41 -0800 + Commit: 1901b19, github.com/apache/spark/pull/4379 + + [SQL][DataFrame] Minor cleanup. + Reynold Xin + 2015-02-04 19:51:48 -0800 + Commit: f05bfa6, github.com/apache/spark/pull/4374 + + [SPARK-4520] [SQL] This pr fixes the ArrayIndexOutOfBoundsException as r... + Sadhan Sood + 2015-02-04 19:18:06 -0800 + Commit: aa6f4ca, github.com/apache/spark/pull/4148 + + [SPARK-5605][SQL][DF] Allow using String to specify colum name in DSL aggregate functions + Reynold Xin + 2015-02-04 18:35:51 -0800 + Commit: 478ee3f, github.com/apache/spark/pull/4376 + + [SPARK-5411] Allow SparkListeners to be specified in SparkConf and loaded when creating SparkContext + Josh Rosen + 2015-02-04 17:18:03 -0800 + Commit: 47e4d57, github.com/apache/spark/pull/4111 + + [SPARK-5577] Python udf for DataFrame + Davies Liu + 2015-02-04 15:55:09 -0800 + Commit: dc9ead9, github.com/apache/spark/pull/4351 + + [SPARK-5118][SQL] Fix: create table test stored as parquet as select .. + guowei2 + 2015-02-04 15:26:10 -0800 + Commit: 06da868, github.com/apache/spark/pull/3921 + + [SQL] Use HiveContext's sessionState in HiveMetastoreCatalog.hiveDefaultTableFilePath + Yin Huai + 2015-02-04 15:22:40 -0800 + Commit: cb4c3e5, github.com/apache/spark/pull/4355 + + [SQL] Correct the default size of TimestampType and expose NumericType + Yin Huai + 2015-02-04 15:14:49 -0800 + Commit: 513bb2c, github.com/apache/spark/pull/4314 + + [SQL][Hiveconsole] Bring hive console code up to date and update README.md + OopsOutOfMemory , Sheng, Li + 2015-02-04 15:13:54 -0800 + Commit: 2cdcfe3, github.com/apache/spark/pull/4330 + + [SPARK-5367][SQL] Support star expression in udfs + wangfei , scwf + 2015-02-04 15:12:07 -0800 + Commit: 8b803f6, github.com/apache/spark/pull/4353 + + [SPARK-5426][SQL] Add SparkSQL Java API helper methods. + kul + 2015-02-04 15:08:37 -0800 + Commit: 38ab92e, github.com/apache/spark/pull/4243 + + [SPARK-5587][SQL] Support change database owner + wangfei + 2015-02-04 14:35:12 -0800 + Commit: 7920791, github.com/apache/spark/pull/4357 + + [SPARK-5591][SQL] Fix NoSuchObjectException for CTAS + wangfei + 2015-02-04 14:33:07 -0800 + Commit: c79dd1e, github.com/apache/spark/pull/4365 + + [SPARK-4939] move to next locality when no pending tasks + Davies Liu + 2015-02-04 14:22:07 -0800 + Commit: f9bb3cb, github.com/apache/spark/pull/3779 + + [SPARK-4707][STREAMING] Reliable Kafka Receiver can lose data if the blo... 
+ Hari Shreedharan + 2015-02-04 14:20:44 -0800 + Commit: 14c9f32, github.com/apache/spark/pull/3655 + + [SPARK-4964] [Streaming] Exactly-once semantics for Kafka + cody koeninger + 2015-02-04 12:06:34 -0800 + Commit: a119cae, github.com/apache/spark/pull/3798 + + [SPARK-5588] [SQL] support select/filter by SQL expression + Davies Liu + 2015-02-04 11:34:46 -0800 + Commit: 950a0d3, github.com/apache/spark/pull/4359 + + [SPARK-5585] Flaky test in MLlib python + Davies Liu + 2015-02-04 08:54:20 -0800 + Commit: 84c6273, github.com/apache/spark/pull/4358 + + [SPARK-5574] use given name prefix in dir + Imran Rashid + 2015-02-04 01:02:20 -0800 + Commit: 5d9278a, github.com/apache/spark/pull/4344 + + [Minor] Fix incorrect warning log + Liang-Chi Hsieh + 2015-02-04 00:52:41 -0800 + Commit: 316a4bb, github.com/apache/spark/pull/4360 + + [SPARK-5379][Streaming] Add awaitTerminationOrTimeout + zsxwing + 2015-02-04 00:40:28 -0800 + Commit: 4d3dbfd, github.com/apache/spark/pull/4171 + + [SPARK-5341] Use maven coordinates as dependencies in spark-shell and spark-submit + Burak Yavuz + 2015-02-03 22:39:17 -0800 + Commit: 3b7acd2, github.com/apache/spark/pull/4215 + + [SPARK-4939] revive offers periodically in LocalBackend + Davies Liu + 2015-02-03 22:30:23 -0800 + Commit: e196da8, github.com/apache/spark/pull/4147 + + [SPARK-4969][STREAMING][PYTHON] Add binaryRecords to streaming + freeman + 2015-02-03 22:24:30 -0800 + Commit: 9a33f89, github.com/apache/spark/pull/3803 + + [SPARK-5579][SQL][DataFrame] Support for project/filter using SQL expressions + Reynold Xin + 2015-02-03 22:15:35 -0800 + Commit: cb7f783, github.com/apache/spark/pull/4348 + + [FIX][MLLIB] fix seed handling in Python GMM + Xiangrui Meng + 2015-02-03 20:39:11 -0800 + Commit: 679228b, github.com/apache/spark/pull/4349 + + [SPARK-4795][Core] Redesign the "primitive type => Writable" implicit APIs to make them be activated automatically + zsxwing + 2015-02-03 20:17:12 -0800 + Commit: 5c63e05, github.com/apache/spark/pull/3642 + + [SPARK-5578][SQL][DataFrame] Provide a convenient way for Scala users to use UDFs + Reynold Xin + 2015-02-03 20:07:46 -0800 + Commit: b22d5b5, github.com/apache/spark/pull/4345 + + [SPARK-5520][MLlib] Make FP-Growth implementation take generic item types (WIP) + Jacky Li , Jacky Li , Xiangrui Meng + 2015-02-03 17:02:42 -0800 + Commit: 298ef5b, github.com/apache/spark/pull/4340 + + [SPARK-5554] [SQL] [PySpark] add more tests for DataFrame Python API + Davies Liu + 2015-02-03 16:01:56 -0800 + Commit: 4640623, github.com/apache/spark/pull/4331 + + [STREAMING] SPARK-4986 Wait for receivers to deregister and receiver job to terminate + Jesper Lundgren + 2015-02-03 14:53:39 -0800 + Commit: 092d4ba, github.com/apache/spark/pull/4338 + + [SPARK-5153][Streaming][Test] Increased timeout to deal with flaky KafkaStreamSuite + Tathagata Das + 2015-02-03 13:46:02 -0800 + Commit: d644bd9, github.com/apache/spark/pull/4342 + + [SPARK-4508] [SQL] build native date type to conform behavior to Hive + Daoyuan Wang + 2015-02-03 12:21:45 -0800 + Commit: 6e244cf, github.com/apache/spark/pull/4325 + + [SPARK-5383][SQL] Support alias for udtfs + wangfei , scwf , Fei Wang + 2015-02-03 12:16:31 -0800 + Commit: 5dbeb21, github.com/apache/spark/pull/4186 + + 
[SPARK-5550] [SQL] Support the case insensitive for UDF + Cheng Hao + 2015-02-03 12:12:26 -0800 + Commit: 654c992, github.com/apache/spark/pull/4326 + + [SPARK-4987] [SQL] parquet timestamp type support + Daoyuan Wang + 2015-02-03 12:06:06 -0800 + Commit: 67d5220, github.com/apache/spark/pull/3820 + + [SQL] DataFrame API update + Reynold Xin + 2015-02-03 10:34:56 -0800 + Commit: 4204a12, github.com/apache/spark/pull/4332 + + Minor: Fix TaskContext deprecated annotations. + Reynold Xin + 2015-02-03 10:34:16 -0800 + Commit: f7948f3, github.com/apache/spark/pull/4333 + + [SPARK-5549] Define TaskContext interface in Scala. + Reynold Xin + 2015-02-03 00:46:04 -0800 + Commit: bebf4c4, github.com/apache/spark/pull/4324 + + [SPARK-5551][SQL] Create type alias for SchemaRDD for source backward compatibility + Reynold Xin + 2015-02-03 00:29:23 -0800 + Commit: 523a935, github.com/apache/spark/pull/4327 + + [SQL][DataFrame] Remove DataFrameApi, ExpressionApi, and GroupedDataFrameApi + Reynold Xin + 2015-02-03 00:29:04 -0800 + Commit: 37df330, github.com/apache/spark/pull/4328 + + [minor] update streaming linear algorithms + Xiangrui Meng + 2015-02-03 00:14:43 -0800 + Commit: 659329f, github.com/apache/spark/pull/4329 + + [SPARK-1405] [mllib] Latent Dirichlet Allocation (LDA) using EM + Xiangrui Meng + 2015-02-02 23:57:35 -0800 + Commit: 980764f, github.com/apache/spark/pull/2388 + + [SPARK-5536] replace old ALS implementation by the new one + Xiangrui Meng + 2015-02-02 23:49:09 -0800 + Commit: 0cc7b88, github.com/apache/spark/pull/4321 + + [SPARK-5414] Add SparkFirehoseListener class for consuming all SparkListener events + Josh Rosen + 2015-02-02 23:35:07 -0800 + Commit: b8ebebe, github.com/apache/spark/pull/4210 + + [SPARK-5501][SPARK-5420][SQL] Write support for the data source API + Yin Huai + 2015-02-02 23:30:44 -0800 + Commit: 13531dd, github.com/apache/spark/pull/4294 + + [SPARK-5012][MLLib][PySpark]Python API for Gaussian Mixture Model + FlytxtRnD + 2015-02-02 23:04:55 -0800 + Commit: 50a1a87, github.com/apache/spark/pull/4059 + + [SPARK-3778] newAPIHadoopRDD doesn't properly pass credentials for secure hdfs + Thomas Graves + 2015-02-02 22:45:55 -0800 + Commit: c31c36c, github.com/apache/spark/pull/4292 + + [SPARK-4979][MLLIB] Streaming logisitic regression + freeman + 2015-02-02 22:42:15 -0800 + Commit: eb0da6c, github.com/apache/spark/pull/4306 + + [SPARK-5219][Core] Add locks to avoid scheduling race conditions + zsxwing + 2015-02-02 21:42:18 -0800 + Commit: c306555, github.com/apache/spark/pull/4019 + + [Doc] Minor: Fixes several formatting issues + Cheng Lian + 2015-02-02 21:14:21 -0800 + Commit: 60f67e7, github.com/apache/spark/pull/4316 + + SPARK-3996: Add jetty servlet and continuations. + Patrick Wendell + 2015-02-02 21:01:36 -0800 + Commit: 7930d2b, github.com/apache/spark/pull/4323 + + SPARK-5542: Decouple publishing, packaging, and tagging in release script + Patrick Wendell , Patrick Wendell + 2015-02-02 21:00:30 -0800 + Commit: 0ef38f5, github.com/apache/spark/pull/4319 + + [SPARK-5543][WebUI] Remove unused import JsonUtil from from JsonProtocol + nemccarthy + 2015-02-02 20:03:13 -0800 + Commit: cb39f12, github.com/apache/spark/pull/4320 + + [SPARK-5472][SQL] A JDBC data source for Spark SQL. 
+ Tor Myklebust + 2015-02-02 19:50:14 -0800 + Commit: 8f471a6, github.com/apache/spark/pull/4261 + + [SPARK-5512][Mllib] Run the PIC algorithm with initial vector suggected by the PIC paper + Liang-Chi Hsieh + 2015-02-02 19:34:25 -0800 + Commit: 1bcd465, github.com/apache/spark/pull/4301 + + [SPARK-5154] [PySpark] [Streaming] Kafka streaming support in Python + Davies Liu , Tathagata Das + 2015-02-02 19:16:27 -0800 + Commit: 0561c45, github.com/apache/spark/pull/3715 + + [SQL] Improve DataFrame API error reporting + Reynold Xin , Davies Liu + 2015-02-02 19:01:47 -0800 + Commit: 554403f, github.com/apache/spark/pull/4296 + + Revert "[SPARK-4508] [SQL] build native date type to conform behavior to Hive" + Patrick Wendell + 2015-02-02 17:52:17 -0800 + Commit: eccb9fb + + Spark 3883: SSL support for HttpServer and Akka + Jacek Lewandowski , Jacek Lewandowski + 2015-02-02 17:18:54 -0800 + Commit: cfea300, github.com/apache/spark/pull/3571 + + [SPARK-5540] hide ALS.solveLeastSquares + Xiangrui Meng + 2015-02-02 17:10:01 -0800 + Commit: ef65cf0, github.com/apache/spark/pull/4318 + + [SPARK-5534] [graphx] Graph getStorageLevel fix + Joseph K. Bradley + 2015-02-02 17:02:29 -0800 + Commit: f133dec, github.com/apache/spark/pull/4317 + + [SPARK-5514] DataFrame.collect should call executeCollect + Reynold Xin + 2015-02-02 16:55:36 -0800 + Commit: 8aa3cff, github.com/apache/spark/pull/4313 + + [SPARK-5195][sql]Update HiveMetastoreCatalog.scala(override the MetastoreRelation's sameresult method only compare databasename and table name) + seayi <405078363@qq.com>, Michael Armbrust + 2015-02-02 16:06:52 -0800 + Commit: dca6faa, github.com/apache/spark/pull/3898 + + [SPARK-2309][MLlib] Multinomial Logistic Regression + DB Tsai + 2015-02-02 15:59:15 -0800 + Commit: b1aa8fe, github.com/apache/spark/pull/3833 + + [SPARK-5513][MLLIB] Add nonnegative option to ml's ALS + Xiangrui Meng + 2015-02-02 15:55:44 -0800 + Commit: 46d50f1, github.com/apache/spark/pull/4302 + + [SPARK-4508] [SQL] build native date type to conform behavior to Hive + Daoyuan Wang + 2015-02-02 15:49:22 -0800 + Commit: 1646f89, github.com/apache/spark/pull/3732 + + SPARK-5500. Document that feeding hadoopFile into a shuffle operation wi... + Sandy Ryza + 2015-02-02 14:52:46 -0800 + Commit: 8309349, github.com/apache/spark/pull/4293 + + [SPARK-5461] [graphx] Add isCheckpointed, getCheckpointedFiles methods to Graph + Joseph K. Bradley + 2015-02-02 14:34:48 -0800 + Commit: 842d000, github.com/apache/spark/pull/4253 + + SPARK-5425: Use synchronised methods in system properties to create SparkConf + Jacek Lewandowski + 2015-02-02 14:07:19 -0800 + Commit: 5a55261, github.com/apache/spark/pull/4222 + + Disabling Utils.chmod700 for Windows + Martin Weindel , mweindel + 2015-02-02 13:46:18 -0800 + Commit: bff65b5, github.com/apache/spark/pull/4299 + + Make sure only owner can read / write to directories created for the job. + Josh Rosen + 2015-01-21 14:38:14 -0800 + Commit: 52f5754 + + [HOTFIX] Add jetty references to build for YARN module. + Patrick Wendell + 2015-02-02 14:00:14 -0800 + Commit: 2321dd1 + + [SPARK-4631][streaming][FIX] Wait for a receiver to start before publishing test data. 
+ Iulian Dragos + 2015-02-02 14:00:33 -0800 + Commit: e908322, github.com/apache/spark/pull/4270 + + [SPARK-5212][SQL] Add support of schema-less, custom field delimiter and SerDe for HiveQL transform + Liang-Chi Hsieh + 2015-02-02 13:53:55 -0800 + Commit: 683e938, github.com/apache/spark/pull/4014 + + [SPARK-5530] Add executor container to executorIdToContainer + Xutingjun <1039320815@qq.com> + 2015-02-02 12:37:51 -0800 + Commit: 62a93a1, github.com/apache/spark/pull/4309 + + [Docs] Fix Building Spark link text + Nicholas Chammas + 2015-02-02 12:33:49 -0800 + Commit: 3f941b6, github.com/apache/spark/pull/4312 + + [SPARK-5173]support python application running on yarn cluster mode + lianhuiwang , Wang Lianhui + 2015-02-02 12:32:28 -0800 + Commit: f5e6375, github.com/apache/spark/pull/3976 + + SPARK-4585. Spark dynamic executor allocation should use minExecutors as... + Sandy Ryza + 2015-02-02 12:27:08 -0800 + Commit: b2047b5, github.com/apache/spark/pull/4051 + + [MLLIB] SPARK-5491 (ex SPARK-1473): Chi-square feature selection + Alexander Ulanov + 2015-02-02 12:13:05 -0800 + Commit: c081b21, github.com/apache/spark/pull/1484 + + SPARK-5492. Thread statistics can break with older Hadoop versions + Sandy Ryza + 2015-02-02 00:54:06 -0800 + Commit: 6f34131, github.com/apache/spark/pull/4305 + + [SPARK-5478][UI][Minor] Add missing right parentheses + jerryshao + 2015-02-01 23:56:13 -0800 + Commit: 63dfe21, github.com/apache/spark/pull/4267 + + [SPARK-5353] Log failures in REPL class loading + Tobias Schlatter + 2015-02-01 21:43:49 -0800 + Commit: 9f0a6e1, github.com/apache/spark/pull/4130 + + [SPARK-3996]: Shade Jetty in Spark deliverables + Patrick Wendell + 2015-02-01 21:13:57 -0800 + Commit: a15f6e3, github.com/apache/spark/pull/4285 + + [SPARK-4001][MLlib] adding parallel FP-Growth algorithm for frequent pattern mining in MLlib + Jacky Li , Jacky Li , Xiangrui Meng + 2015-02-01 20:07:25 -0800 + Commit: 859f724, github.com/apache/spark/pull/2847 + + [Spark-5406][MLlib] LocalLAPACK mode in RowMatrix.computeSVD should have much smaller upper bound + Yuhao Yang + 2015-02-01 19:40:26 -0800 + Commit: d85cd4e, github.com/apache/spark/pull/4200 + + [SPARK-5465] [SQL] Fixes filter push-down for Parquet data source + Cheng Lian + 2015-02-01 18:52:39 -0800 + Commit: ec10032, github.com/apache/spark/pull/4255 + + [SPARK-5262] [SPARK-5244] [SQL] add coalesce in SQLParser and widen types for parameters of coalesce + Daoyuan Wang + 2015-02-01 18:51:38 -0800 + Commit: 8cf4a1f, github.com/apache/spark/pull/4057 + + [SPARK-5196][SQL] Support `comment` in Create Table Field DDL + OopsOutOfMemory + 2015-02-01 18:41:49 -0800 + Commit: 1b56f1d, github.com/apache/spark/pull/3999 + + [SPARK-1825] Make Windows Spark client work fine with Linux YARN cluster + Masayoshi TSUZUKI + 2015-02-01 18:26:28 -0800 + Commit: 7712ed5, github.com/apache/spark/pull/3943 + + [SPARK-5176] The thrift server does not support cluster mode + Tom Panning + 2015-02-01 17:57:31 -0800 + Commit: 1ca0a10, github.com/apache/spark/pull/4137 + + [SPARK-5155] Build fails with spark-ganglia-lgpl profile + Kousuke Saruta + 2015-02-01 17:53:56 -0800 + Commit: c80194b, github.com/apache/spark/pull/4303 + + [Minor][SQL] Little refactor DataFrame related codes + Liang-Chi Hsieh + 2015-02-01 
17:52:18 -0800 + Commit: ef89b82, github.com/apache/spark/pull/4298 + + [SPARK-4859][Core][Streaming] Refactor LiveListenerBus and StreamingListenerBus + zsxwing + 2015-02-01 17:47:51 -0800 + Commit: 883bc88, github.com/apache/spark/pull/4006 + + [SPARK-5424][MLLIB] make the new ALS impl take generic ID types + Xiangrui Meng + 2015-02-01 14:13:31 -0800 + Commit: 4a17122, github.com/apache/spark/pull/4281 + + [SPARK-5207] [MLLIB] StandardScalerModel mean and variance re-use + Octavian Geagla + 2015-02-01 09:21:14 -0800 + Commit: bdb0680, github.com/apache/spark/pull/4140 + + [SPARK-5422] Add support for sending Graphite metrics via UDP + Ryan Williams + 2015-01-31 23:41:05 -0800 + Commit: 80bd715, github.com/apache/spark/pull/4218 + + SPARK-3359 [CORE] [DOCS] `sbt/sbt unidoc` doesn't work with Java 8 + Sean Owen + 2015-01-31 10:40:42 -0800 + Commit: c84d5a1, github.com/apache/spark/pull/4193 + + [SPARK-3975] Added support for BlockMatrix addition and multiplication + Burak Yavuz , Burak Yavuz , Burak Yavuz , Burak Yavuz , Burak Yavuz + 2015-01-31 00:47:30 -0800 + Commit: ef8974b, github.com/apache/spark/pull/4274 + + [MLLIB][SPARK-3278] Monotone (Isotonic) regression using parallel pool adjacent violators algorithm + martinzapletal , Xiangrui Meng , Martin Zapletal + 2015-01-31 00:46:02 -0800 + Commit: 34250a6, github.com/apache/spark/pull/3519 + + [SPARK-5307] Add a config option for SerializationDebugger. + Reynold Xin + 2015-01-31 00:06:36 -0800 + Commit: 6364083, github.com/apache/spark/pull/4297 + + [SQL] remove redundant field "childOutput" from execution.Aggregate, use child.output instead + kai + 2015-01-30 23:19:10 -0800 + Commit: f54c9f6, github.com/apache/spark/pull/4291 + + [SPARK-5307] SerializationDebugger + Reynold Xin + 2015-01-30 22:34:10 -0800 + Commit: 740a568, github.com/apache/spark/pull/4098 + + [SPARK-5504] [sql] convertToCatalyst should support nested arrays + Joseph K. Bradley + 2015-01-30 15:40:14 -0800 + Commit: e643de4, github.com/apache/spark/pull/4295 + + SPARK-5400 [MLlib] Changed name of GaussianMixtureEM to GaussianMixture + Travis Galoppo + 2015-01-30 15:32:25 -0800 + Commit: 9869773, github.com/apache/spark/pull/4290 + + [SPARK-4259][MLlib]: Add Power Iteration Clustering Algorithm with Gaussian Similarity Function + sboeschhuawei , Fan Jiang , Jiang Fan , Stephen Boesch , Xiangrui Meng + 2015-01-30 14:09:49 -0800 + Commit: f377431, github.com/apache/spark/pull/4254 + + [SPARK-5486] Added validate method to BlockMatrix + Burak Yavuz + 2015-01-30 13:59:10 -0800 + Commit: 6ee8338, github.com/apache/spark/pull/4279 + + [SPARK-5496][MLLIB] Allow both classification and Classification in Algo for trees. + Xiangrui Meng + 2015-01-30 10:08:07 -0800 + Commit: 0a95085, github.com/apache/spark/pull/4287 + + [MLLIB] SPARK-4846: throw a RuntimeException and give users hints to increase the minCount + Joseph J.C. Tang + 2015-01-30 10:07:26 -0800 + Commit: 54d9575, github.com/apache/spark/pull/4247 + + SPARK-5393. Flood of util.RackResolver log messages after SPARK-1714 + Sandy Ryza + 2015-01-30 11:31:54 -0600 + Commit: 254eaa4, github.com/apache/spark/pull/4192 + + [SPARK-5457][SQL] Add missing DSL for ApproxCountDistinct. 
+ Takuya UESHIN + 2015-01-30 01:21:35 -0800 + Commit: 6f21dce, github.com/apache/spark/pull/4250 + + [SPARK-5094][MLlib] Add Python API for Gradient Boosted Trees + Kazuki Taniguchi + 2015-01-30 00:39:44 -0800 + Commit: bc1fc9b, github.com/apache/spark/pull/3951 + + [SPARK-5322] Added transpose functionality to BlockMatrix + Burak Yavuz + 2015-01-29 21:26:29 -0800 + Commit: dd4d84c, github.com/apache/spark/pull/4275 + + [SQL] Support df("*") to select all columns in a data frame. + Reynold Xin + 2015-01-29 19:09:08 -0800 + Commit: 80def9d, github.com/apache/spark/pull/4283 + + [SPARK-5462] [SQL] Use analyzed query plan in DataFrame.apply() + Josh Rosen + 2015-01-29 18:23:05 -0800 + Commit: 22271f9, github.com/apache/spark/pull/4282 + + [SPARK-5395] [PySpark] fix python process leak while coalesce() + Davies Liu + 2015-01-29 17:28:37 -0800 + Commit: 5c746ee, github.com/apache/spark/pull/4238 + + [SQL] DataFrame API improvements + Reynold Xin + 2015-01-29 17:24:00 -0800 + Commit: ce9c43b, github.com/apache/spark/pull/4280 + + Revert "[WIP] [SPARK-3996]: Shade Jetty in Spark deliverables" + Patrick Wendell + 2015-01-29 17:14:27 -0800 + Commit: d2071e8 + + remove 'return' + Yoshihiro Shimizu + 2015-01-29 16:55:00 -0800 + Commit: 5338772, github.com/apache/spark/pull/4268 + + [WIP] [SPARK-3996]: Shade Jetty in Spark deliverables + Patrick Wendell + 2015-01-29 16:31:19 -0800 + Commit: f240fe3, github.com/apache/spark/pull/4252 + + [SPARK-5464] Fix help() for Python DataFrame instances + Josh Rosen + 2015-01-29 16:23:20 -0800 + Commit: 0bb15f2, github.com/apache/spark/pull/4278 + + [SPARK-4296][SQL] Trims aliases when resolving and checking aggregate expressions + Yin Huai , Cheng Lian + 2015-01-29 15:49:34 -0800 + Commit: c00d517, github.com/apache/spark/pull/4010 + + [SPARK-5373][SQL] Literal in agg grouping expressions leads to incorrect result + wangfei + 2015-01-29 15:47:13 -0800 + Commit: c1b3eeb, github.com/apache/spark/pull/4169 + + [SPARK-5367][SQL] Support star expression in udf + wangfei , scwf + 2015-01-29 15:44:53 -0800 + Commit: fbaf9e0, github.com/apache/spark/pull/4163 + + [SPARK-4786][SQL]: Parquet filter pushdown for castable types + Yash Datta + 2015-01-29 15:42:23 -0800 + Commit: de221ea, github.com/apache/spark/pull/4156 + + [SPARK-5309][SQL] Add support for dictionaries in PrimitiveConverter for Strin... + Michael Davies + 2015-01-29 15:40:59 -0800 + Commit: 940f375, github.com/apache/spark/pull/4187 + + [SPARK-5429][SQL] Use javaXML plan serialization for Hive golden answers on Hive 0.13.1 + Liang-Chi Hsieh + 2015-01-29 15:28:22 -0800 + Commit: bce0ba1, github.com/apache/spark/pull/4223 + + [SPARK-5445][SQL] Consolidate Java and Scala DSL static methods. + Reynold Xin + 2015-01-29 15:13:09 -0800 + Commit: 7156322, github.com/apache/spark/pull/4276 + + [SPARK-5466] Add explicit guava dependencies where needed. + Marcelo Vanzin + 2015-01-29 13:00:45 -0800 + Commit: f9e5694, github.com/apache/spark/pull/4272 + + [SPARK-5477] refactor stat.py + Xiangrui Meng + 2015-01-29 10:11:44 -0800 + Commit: a3dc618, github.com/apache/spark/pull/4266 + + [SQL] Various DataFrame DSL update. 
+ Reynold Xin + 2015-01-29 00:01:10 -0800 + Commit: 5ad78f6, github.com/apache/spark/pull/4260 + + [SPARK-3977] Conversion methods for BlockMatrix to other Distributed Matrices + Burak Yavuz + 2015-01-28 23:42:07 -0800 + Commit: a63be1a, github.com/apache/spark/pull/4256 + + [SPARK-5445][SQL] Made DataFrame dsl usable in Java + Reynold Xin + 2015-01-28 19:10:32 -0800 + Commit: 5b9760d, github.com/apache/spark/pull/4241 + + [SPARK-5430] move treeReduce and treeAggregate from mllib to core + Xiangrui Meng + 2015-01-28 17:26:03 -0800 + Commit: 4ee79c7, github.com/apache/spark/pull/4228 + + [SPARK-4586][MLLIB] Python API for ML pipeline and parameters + Xiangrui Meng , Davies Liu + 2015-01-28 17:14:23 -0800 + Commit: e80dc1c, github.com/apache/spark/pull/4151 + + [SPARK-5441][pyspark] Make SerDeUtil PairRDD to Python conversions more robust + Michael Nazario + 2015-01-28 13:55:01 -0800 + Commit: e023112, github.com/apache/spark/pull/4236 + + [SPARK-4387][PySpark] Refactoring python profiling code to make it extensible + Yandu Oppacher , Davies Liu + 2015-01-28 13:48:06 -0800 + Commit: 3bead67, github.com/apache/spark/pull/3255. + + [SPARK-5417] Remove redundant executor-id set() call + Ryan Williams + 2015-01-28 13:04:52 -0800 + Commit: a731314, github.com/apache/spark/pull/4213 + + [SPARK-5434] [EC2] Preserve spaces in EC2 path + Nicholas Chammas + 2015-01-28 12:56:03 -0800 + Commit: d44ee43, github.com/apache/spark/pull/4224 + + [SPARK-5437] Fix DriverSuite and SparkSubmitSuite timeout issues + Andrew Or + 2015-01-28 12:52:31 -0800 + Commit: 84b6ecd, github.com/apache/spark/pull/4230 + + [SPARK-4955]With executor dynamic scaling enabled,executor shoude be added or killed in yarn-cluster mode. + lianhuiwang + 2015-01-28 12:50:57 -0800 + Commit: 81f8f34, github.com/apache/spark/pull/3962 + + [SPARK-5440][pyspark] Add toLocalIterator to pyspark rdd + Michael Nazario + 2015-01-28 12:47:12 -0800 + Commit: 456c11f, github.com/apache/spark/pull/4237 + + SPARK-1934 [CORE] "this" reference escape to "selectorThread" during construction in ConnectionManager + Sean Owen + 2015-01-28 12:44:35 -0800 + Commit: 9b18009, github.com/apache/spark/pull/4225 + + [SPARK-5188][BUILD] make-distribution.sh should support curl, not only wget to get Tachyon + Kousuke Saruta + 2015-01-28 12:43:22 -0800 + Commit: e902dc4, github.com/apache/spark/pull/3988 + + SPARK-5458. Refer to aggregateByKey instead of combineByKey in docs + Sandy Ryza + 2015-01-28 12:41:23 -0800 + Commit: 406f6d3, github.com/apache/spark/pull/4251 + + [SPARK-5447][SQL] Replaced reference to SchemaRDD with DataFrame. 
+ Reynold Xin + 2015-01-28 12:10:01 -0800 + Commit: c8e934e, github.com/apache/spark/pull/4242 + + [SPARK-5361]Multiple Java RDD <-> Python RDD conversions not working correctly + Winston Chen + 2015-01-28 11:08:44 -0800 + Commit: 453d799, github.com/apache/spark/pull/4146 + + [SPARK-5291][CORE] Add timestamp and reason why an executor is removed to SparkListenerExecutorAdded and SparkListenerExecutorRemoved + Kousuke Saruta + 2015-01-28 11:02:51 -0800 + Commit: 0b35fcd, github.com/apache/spark/pull/4082 + + [SPARK-3974][MLlib] Distributed Block Matrix Abstractions + Burak Yavuz , Xiangrui Meng , Burak Yavuz , Burak Yavuz , Burak Yavuz + 2015-01-28 10:06:37 -0800 + Commit: eeb53bf, github.com/apache/spark/pull/3200 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-01-28 02:15:14 -0800 + Commit: 622ff09, github.com/apache/spark/pull/1480 + + [SPARK-5415] bump sbt to version to 0.13.7 + Ryan Williams + 2015-01-28 02:13:06 -0800 + Commit: 661d3f9, github.com/apache/spark/pull/4211 + + [SPARK-4809] Rework Guava library shading. + Marcelo Vanzin + 2015-01-28 00:29:29 -0800 + Commit: 37a5e27, github.com/apache/spark/pull/3658 + + [SPARK-5097][SQL] Test cases for DataFrame expressions. + Reynold Xin + 2015-01-27 18:10:49 -0800 + Commit: d743732, github.com/apache/spark/pull/4235 + + [SPARK-5097][SQL] DataFrame + Reynold Xin , Davies Liu + 2015-01-27 16:08:24 -0800 + Commit: 119f45d, github.com/apache/spark/pull/4173 + + SPARK-5199. FS read metrics should support CombineFileSplits and track bytes from all FSs + Sandy Ryza + 2015-01-27 15:42:55 -0800 + Commit: b1b35ca, github.com/apache/spark/pull/4050 + + [MLlib] fix python example of ALS in guide + Davies Liu + 2015-01-27 15:33:01 -0800 + Commit: fdaad4e, github.com/apache/spark/pull/4226 + + SPARK-5308 [BUILD] MD5 / SHA1 hash format doesn't match standard Maven output + Sean Owen + 2015-01-27 10:22:50 -0800 + Commit: ff356e2, github.com/apache/spark/pull/4161 + + [SPARK-5321] Support for transposing local matrices + Burak Yavuz + 2015-01-27 01:46:17 -0800 + Commit: 9142674, github.com/apache/spark/pull/4109 + + [SPARK-5419][Mllib] Fix the logic in Vectors.sqdist + Liang-Chi Hsieh + 2015-01-27 01:29:14 -0800 + Commit: 7b0ed79, github.com/apache/spark/pull/4217 + + [SPARK-3726] [MLlib] Allow sampling_rate not equal to 1.0 in RandomForests + MechCoder + 2015-01-26 19:46:17 -0800 + Commit: d6894b1, github.com/apache/spark/pull/4073 + + [SPARK-5119] java.lang.ArrayIndexOutOfBoundsException on trying to train... + lewuathe + 2015-01-26 18:03:21 -0800 + Commit: f2ba5c6, github.com/apache/spark/pull/3975 + + [SPARK-5052] Add common/base classes to fix guava methods signatures. + Elmer Garduno + 2015-01-26 17:40:48 -0800 + Commit: 661e0fc, github.com/apache/spark/pull/3874 + + SPARK-960 [CORE] [TEST] JobCancellationSuite "two jobs sharing the same stage" is broken + Sean Owen + 2015-01-26 14:32:27 -0800 + Commit: 0497ea5, github.com/apache/spark/pull/4180 + + Fix command spaces issue in make-distribution.sh + David Y. 
Ross + 2015-01-26 14:26:10 -0800 + Commit: b38034e, github.com/apache/spark/pull/4126 + + SPARK-4147 [CORE] Reduce log4j dependency + Sean Owen + 2015-01-26 14:23:42 -0800 + Commit: 54e7b45, github.com/apache/spark/pull/4190 + + [SPARK-5339][BUILD] build/mvn doesn't work because of invalid URL for maven's tgz. + Kousuke Saruta + 2015-01-26 13:07:49 -0800 + Commit: c094c73, github.com/apache/spark/pull/4124 + + [SPARK-5355] use j.u.c.ConcurrentHashMap instead of TrieMap + Davies Liu + 2015-01-26 12:51:32 -0800 + Commit: 1420931, github.com/apache/spark/pull/4208 + + [SPARK-5384][mllib] Vectors.sqdist returns inconsistent results for sparse/dense vectors when the vectors have different lengths + Yuhao Yang + 2015-01-25 22:18:09 -0800 + Commit: 8125168, github.com/apache/spark/pull/4183 + + [SPARK-5268] don't stop CoarseGrainedExecutorBackend for irrelevant DisassociatedEvent + CodingCat + 2015-01-25 19:28:53 -0800 + Commit: 8df9435, github.com/apache/spark/pull/4063 + + SPARK-4430 [STREAMING] [TEST] Apache RAT Checks fail spuriously on test files + Sean Owen + 2015-01-25 19:16:44 -0800 + Commit: 0528b85, github.com/apache/spark/pull/4189 + + [SPARK-5326] Show fetch wait time as optional metric in the UI + Kay Ousterhout + 2015-01-25 16:48:26 -0800 + Commit: fc2168f, github.com/apache/spark/pull/4110 + + [SPARK-5344][WebUI] HistoryServer cannot recognize that inprogress file was renamed to completed file + Kousuke Saruta + 2015-01-25 15:34:20 -0800 + Commit: 8f5c827, github.com/apache/spark/pull/4132 + + SPARK-4506 [DOCS] Addendum: Update more docs to reflect that standalone works in cluster mode + Sean Owen + 2015-01-25 15:25:05 -0800 + Commit: 9f64357, github.com/apache/spark/pull/4160 + + SPARK-5382: Use SPARK_CONF_DIR in spark-class if it is defined + Jacek Lewandowski + 2015-01-25 15:15:09 -0800 + Commit: 1c30afd, github.com/apache/spark/pull/4179 + + SPARK-3782 [CORE] Direct use of log4j in AkkaUtils interferes with certain logging configurations + Sean Owen + 2015-01-25 15:11:57 -0800 + Commit: 383425a, github.com/apache/spark/pull/4184 + + SPARK-3852 [DOCS] Document spark.driver.extra* configs + Sean Owen + 2015-01-25 15:08:05 -0800 + Commit: c586b45, github.com/apache/spark/pull/4185 + + [SPARK-5402] log executor ID at executor-construction time + Ryan Williams + 2015-01-25 14:20:02 -0800 + Commit: aea2548, github.com/apache/spark/pull/4195 + + [SPARK-5401] set executor ID before creating MetricsSystem + Ryan Williams + 2015-01-25 14:17:59 -0800 + Commit: 2d9887b, github.com/apache/spark/pull/4194 + + Add comment about defaultMinPartitions + Idan Zalzberg + 2015-01-25 11:28:05 -0800 + Commit: 412a58e, github.com/apache/spark/pull/4102 + + Closes #4157 + Reynold Xin + 2015-01-25 00:24:59 -0800 + Commit: d22ca1e + + [SPARK-5214][Test] Add a test to demonstrate EventLoop can be stopped in the event thread + zsxwing + 2015-01-24 11:00:35 -0800 + Commit: 0d1e67e, github.com/apache/spark/pull/4174 + + [SPARK-5058] Part 2. Typos and broken URL + Jongyoul Lee + 2015-01-23 23:34:11 -0800 + Commit: 09e09c5, github.com/apache/spark/pull/4172 + + [SPARK-5351][GraphX] Do not use Partitioner.defaultPartitioner as a partitioner of EdgeRDDImp... 
+ Takeshi Yamamuro + 2015-01-23 19:25:15 -0800 + Commit: e224dbb, github.com/apache/spark/pull/4136 + + [SPARK-5063] More helpful error messages for several invalid operations + Josh Rosen + 2015-01-23 17:53:15 -0800 + Commit: cef1f09, github.com/apache/spark/pull/3884 + + [SPARK-3541][MLLIB] New ALS implementation with improved storage + Xiangrui Meng + 2015-01-22 22:09:13 -0800 + Commit: ea74365, github.com/apache/spark/pull/3720 + + [SPARK-5315][Streaming] Fix reduceByWindow Java API not work bug + jerryshao + 2015-01-22 22:04:21 -0800 + Commit: e0f7fb7, github.com/apache/spark/pull/4104 + + [SPARK-5233][Streaming] Fix error replaying of WAL introduced bug + jerryshao + 2015-01-22 21:58:53 -0800 + Commit: 3c3fa63, github.com/apache/spark/pull/4032 + + SPARK-5370. [YARN] Remove some unnecessary synchronization in YarnAlloca... + Sandy Ryza + 2015-01-22 13:49:35 -0600 + Commit: 820ce03, github.com/apache/spark/pull/4164 + + [SPARK-5365][MLlib] Refactor KMeans to reduce redundant data + Liang-Chi Hsieh + 2015-01-22 08:16:35 -0800 + Commit: 246111d, github.com/apache/spark/pull/4159 + + [SPARK-5147][Streaming] Delete the received data WAL log periodically + Tathagata Das , jerryshao + 2015-01-21 23:41:44 -0800 + Commit: 3027f06, github.com/apache/spark/pull/4149 + + [SPARK-5317]Set BoostingStrategy.defaultParams With Enumeration Algo.Classification or Algo.Regression + Basin + 2015-01-21 23:06:34 -0800 + Commit: fcb3e18, github.com/apache/spark/pull/4103 + + [SPARK-3424][MLLIB] cache point distances during k-means|| init + Xiangrui Meng + 2015-01-21 21:20:31 -0800 + Commit: ca7910d, github.com/apache/spark/pull/4144 + + [SPARK-5202] [SQL] Add hql variable substitution support + Cheng Hao + 2015-01-21 17:34:18 -0800 + Commit: 27bccc5, github.com/apache/spark/pull/4003 + + [SPARK-5355] make SparkConf thread-safe + Davies Liu + 2015-01-21 16:51:42 -0800 + Commit: 9bad062, github.com/apache/spark/pull/4143 + + [SPARK-4984][CORE][WEBUI] Adding a pop-up containing the full job description when it is very long + wangfei + 2015-01-21 15:27:42 -0800 + Commit: 3be2a88, github.com/apache/spark/pull/3819 + + [SQL] [Minor] Remove deprecated parquet tests + Cheng Lian + 2015-01-21 14:38:10 -0800 + Commit: ba19689, github.com/apache/spark/pull/4116 + + Revert "[SPARK-5244] [SQL] add coalesce() in sql parser" + Josh Rosen + 2015-01-21 14:27:43 -0800 + Commit: b328ac6 + + [SPARK-5009] [SQL] Long keyword support in SQL Parsers + Cheng Hao + 2015-01-21 13:05:56 -0800 + Commit: 8361078, github.com/apache/spark/pull/3926 + + [SPARK-5244] [SQL] add coalesce() in sql parser + Daoyuan Wang + 2015-01-21 12:59:41 -0800 + Commit: 812d367, github.com/apache/spark/pull/4040 + + [SPARK-5064][GraphX] Add numEdges upperbound validation for R-MAT graph generator to prevent infinite loop + Kenji Kikushima + 2015-01-21 12:34:00 -0800 + Commit: 3ee3ab5, github.com/apache/spark/pull/3950 + + [SPARK-4749] [mllib]: Allow initializing KMeans clusters using a seed + nate.crosswhite , nxwhite-str , Xiangrui Meng + 2015-01-21 10:32:10 -0800 + Commit: 7450a99, github.com/apache/spark/pull/3610 + + [MLlib] [SPARK-5301] Missing conversions and operations on IndexedRowMatrix and CoordinateMatrix + Reza Zadeh + 2015-01-21 09:48:38 -0800 + Commit: aa1e22b, 
github.com/apache/spark/pull/4089 + + SPARK-1714. Take advantage of AMRMClient APIs to simplify logic in YarnA... + Sandy Ryza + 2015-01-21 10:31:54 -0600 + Commit: 2eeada3, github.com/apache/spark/pull/3765 + + [SPARK-5336][YARN]spark.executor.cores must not be less than spark.task.cpus + WangTao , WangTaoTheTonic + 2015-01-21 09:42:30 -0600 + Commit: 8c06a5f, github.com/apache/spark/pull/4123 + + [SPARK-5297][Streaming] Fix Java file stream type erasure problem + jerryshao + 2015-01-20 23:37:47 -0800 + Commit: 424d8c6, github.com/apache/spark/pull/4101 + + [HOTFIX] Update pom.xml to pull MapR's Hadoop version 2.4.1. + Kannan Rajah + 2015-01-20 23:34:04 -0800 + Commit: ec5b0f2, github.com/apache/spark/pull/4108 + + [SPARK-5275] [Streaming] include python source code + Davies Liu + 2015-01-20 22:44:58 -0800 + Commit: bad6c57, github.com/apache/spark/pull/4128 + + [SPARK-5294][WebUI] Hide tables in AllStagePages for "Active Stages, Completed Stages and Failed Stages" when they are empty + Kousuke Saruta + 2015-01-20 16:40:46 -0800 + Commit: 9a151ce, github.com/apache/spark/pull/4083 + + [SPARK-5186] [MLLIB] Vector.equals and Vector.hashCode are very inefficient + Yuhao Yang , Yuhao Yang + 2015-01-20 15:20:20 -0800 + Commit: 2f82c84, github.com/apache/spark/pull/3997 + + [SPARK-5323][SQL] Remove Row's Seq inheritance. + Reynold Xin + 2015-01-20 15:16:14 -0800 + Commit: d181c2a, github.com/apache/spark/pull/4115 + + [SPARK-5287][SQL] Add defaultSizeOf to every data type. + Yin Huai + 2015-01-20 13:26:36 -0800 + Commit: bc20a52, github.com/apache/spark/pull/4081 + + SPARK-5019 [MLlib] - GaussianMixtureModel exposes instances of MultivariateGauss... + Travis Galoppo + 2015-01-20 12:58:11 -0800 + Commit: 23e2554, github.com/apache/spark/pull/4088 + + [SPARK-5329][WebUI] UIWorkloadGenerator should stop SparkContext. + Kousuke Saruta + 2015-01-20 12:40:55 -0800 + Commit: 769aced, github.com/apache/spark/pull/4112 + + SPARK-4660: Use correct class loader in JavaSerializer (copy of PR #3840... + Jacek Lewandowski + 2015-01-20 12:38:01 -0800 + Commit: c93a57f, github.com/apache/spark/pull/4113 + + [SQL][Minor] Refactors deeply nested FP style code in BooleanSimplification + Cheng Lian + 2015-01-20 11:20:14 -0800 + Commit: 8140802, github.com/apache/spark/pull/4091 + + [SPARK-5333][Mesos] MesosTaskLaunchData occurs BufferUnderflowException + Jongyoul Lee + 2015-01-20 10:17:29 -0800 + Commit: 9d9294a, github.com/apache/spark/pull/4119 + + [SPARK-4803] [streaming] Remove duplicate RegisterReceiver message + Ilayaperumal Gopinathan + 2015-01-20 01:41:10 -0800 + Commit: 4afad9c, github.com/apache/spark/pull/3648 + + [SQL][minor] Add a log4j file for catalyst test. 
+ Reynold Xin + 2015-01-20 00:55:25 -0800 + Commit: debc031, github.com/apache/spark/pull/4117 + + SPARK-5270 [CORE] Provide isEmpty() function in RDD API + Sean Owen + 2015-01-19 22:50:44 -0800 + Commit: 306ff18, github.com/apache/spark/pull/4074 + + [SPARK-5214][Core] Add EventLoop and change DAGScheduler to an EventLoop + zsxwing + 2015-01-19 18:15:51 -0800 + Commit: e69fb8c, github.com/apache/spark/pull/4016 + + [SPARK-4504][Examples] fix run-example failure if multiple assembly jars exist + Venkata Ramana Gollamudi + 2015-01-19 11:58:16 -0800 + Commit: 74de94e, github.com/apache/spark/pull/3377 + + [SPARK-5286][SQL] Fail to drop an invalid table when using the data source API + Yin Huai + 2015-01-19 10:45:29 -0800 + Commit: 2604bc3, github.com/apache/spark/pull/4076 + + [SPARK-5284][SQL] Insert into Hive throws NPE when a inner complex type field has a null value + Yin Huai + 2015-01-19 10:44:12 -0800 + Commit: cd5da42, github.com/apache/spark/pull/4077 + + [SPARK-5282][mllib]: RowMatrix easily gets int overflow in the memory size warning + Yuhao Yang + 2015-01-19 10:10:15 -0800 + Commit: 4432568, github.com/apache/spark/pull/4069 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-01-19 02:05:24 -0800 + Commit: 1ac1c1d, github.com/apache/spark/pull/3584 + + [SPARK-5088] Use spark-class for running executors directly + Jongyoul Lee + 2015-01-19 02:01:56 -0800 + Commit: 4a4f9cc, github.com/apache/spark/pull/3897 + + [SPARK-3288] All fields in TaskMetrics should be private and use getters/setters + Ilya Ganelin + 2015-01-19 01:32:22 -0800 + Commit: 3453d57, github.com/apache/spark/pull/4020 + + SPARK-5217 Spark UI should report pending stages during job execution on AllStagesPage. + Prashant Sharma + 2015-01-19 01:28:42 -0800 + Commit: 851b6a9, github.com/apache/spark/pull/4043 + + [SQL] fix typo in class description + Jacky Li + 2015-01-18 23:59:08 -0800 + Commit: 7dbf1fd, github.com/apache/spark/pull/4100 + + [SQL][minor] Put DataTypes.java in java dir. + Reynold Xin + 2015-01-18 16:35:40 -0800 + Commit: 1955645, github.com/apache/spark/pull/4097 + + [SQL][Minor] Update sql doc according to data type APIs changes + scwf + 2015-01-18 11:03:13 -0800 + Commit: 1a200a3, github.com/apache/spark/pull/4095 + + [SPARK-5279][SQL] Use java.math.BigDecimal as the exposed Decimal type. + Reynold Xin + 2015-01-18 11:01:42 -0800 + Commit: 1727e08, github.com/apache/spark/pull/4092 + + [HOTFIX]: Minor clean up regarding skipped artifacts in build files. + Patrick Wendell + 2015-01-17 23:15:12 -0800 + Commit: ad16da1, github.com/apache/spark/pull/4080 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2015-01-17 20:39:54 -0800 + Commit: e12b5b6, github.com/apache/spark/pull/681 + + [SQL][Minor] Added comments and examples to explain BooleanSimplification + Reynold Xin + 2015-01-17 17:35:53 -0800 + Commit: e7884bc, github.com/apache/spark/pull/4090 + + [SPARK-5096] Use sbt tasks instead of vals to get hadoop version + Michael Armbrust + 2015-01-17 17:03:07 -0800 + Commit: 6999910, github.com/apache/spark/pull/3905 + + [SPARK-4937][SQL] Comment for the newly optimization rules in `BooleanSimplification` + scwf + 2015-01-17 15:51:24 -0800 + Commit: c1f3c27, github.com/apache/spark/pull/4086 + + [SQL][minor] Improved Row documentation. + Reynold Xin + 2015-01-17 00:11:08 -0800 + Commit: f3bfc76, github.com/apache/spark/pull/4085 + + [SPARK-5193][SQL] Remove Spark SQL Java-specific API. + Reynold Xin + 2015-01-16 21:09:06 -0800 + Commit: 61b427d, github.com/apache/spark/pull/4065 + + [SPARK-4937][SQL] Adding optimization to simplify the And, Or condition in spark sql + scwf , wangfei + 2015-01-16 14:01:22 -0800 + Commit: ee1c1f3, github.com/apache/spark/pull/3778 + + [SPARK-733] Add documentation on use of accumulators in lazy transformation + Ilya Ganelin + 2015-01-16 13:25:17 -0800 + Commit: fd3a8a1, github.com/apache/spark/pull/4022 + + [SPARK-4923][REPL] Add Developer API to REPL to allow re-publishing the REPL jar + Chip Senkbeil , Chip Senkbeil + 2015-01-16 12:56:40 -0800 + Commit: d05c9ee, github.com/apache/spark/pull/4034 + + [WebUI] Fix collapse of WebUI layout + Kousuke Saruta + 2015-01-16 12:19:08 -0800 + Commit: ecf943d, github.com/apache/spark/pull/3995 + + [SPARK-5231][WebUI] History Server shows wrong job submission time. + Kousuke Saruta + 2015-01-16 10:05:11 -0800 + Commit: e8422c5, github.com/apache/spark/pull/4029 + + [DOCS] Fix typo in return type of cogroup + Sean Owen + 2015-01-16 09:28:44 -0800 + Commit: f6b852a, github.com/apache/spark/pull/4072 + + [SPARK-5201][CORE] deal with int overflow in the ParallelCollectionRDD.slice method + Ye Xianjin + 2015-01-16 09:20:53 -0800 + Commit: e200ac8, github.com/apache/spark/pull/4002 + + [SPARK-1507][YARN]specify # cores for ApplicationMaster + WangTaoTheTonic , WangTao + 2015-01-16 09:16:56 -0800 + Commit: 2be82b1, github.com/apache/spark/pull/4018 + + [SPARK-4092] [CORE] Fix InputMetrics for coalesce'd Rdds + Kostas Sakellis + 2015-01-15 18:48:39 -0800 + Commit: a79a9f9, github.com/apache/spark/pull/3120 + + [SPARK-4857] [CORE] Adds Executor membership events to SparkListener + Kostas Sakellis + 2015-01-15 17:53:42 -0800 + Commit: 96c2c71, github.com/apache/spark/pull/3711 + + [Minor] Fix tiny typo in BlockManager + Kousuke Saruta + 2015-01-15 17:07:44 -0800 + Commit: 65858ba, github.com/apache/spark/pull/4046 + + [SPARK-5274][SQL] Reconcile Java and Scala UDFRegistration. 
+ Reynold Xin + 2015-01-15 16:15:12 -0800 + Commit: 1881431, github.com/apache/spark/pull/4056 + + [SPARK-5224] [PySpark] improve performance of parallelize list/ndarray + Davies Liu + 2015-01-15 11:40:41 -0800 + Commit: 3c8650c, github.com/apache/spark/pull/4024 + + [SPARK-5193][SQL] Tighten up HiveContext API + Reynold Xin + 2015-01-14 20:31:02 -0800 + Commit: 4b325c7, github.com/apache/spark/pull/4054 + + [SPARK-5254][MLLIB] remove developers section from spark.ml guide + Xiangrui Meng + 2015-01-14 18:54:17 -0800 + Commit: 6abc45e, github.com/apache/spark/pull/4053 + + [SPARK-5193][SQL] Tighten up SQLContext API + Reynold Xin + 2015-01-14 18:36:15 -0800 + Commit: cfa397c, github.com/apache/spark/pull/4049 + + [SPARK-5254][MLLIB] Update the user guide to position spark.ml better + Xiangrui Meng + 2015-01-14 17:50:33 -0800 + Commit: 13d2406, github.com/apache/spark/pull/4052 + + [SPARK-5234][ml]examples for ml don't have sparkContext.stop + Yuhao Yang + 2015-01-14 11:53:43 -0800 + Commit: 76389c5, github.com/apache/spark/pull/4044 + + [SPARK-5235] Make SQLConf Serializable + Alex Baretta + 2015-01-14 11:51:55 -0800 + Commit: 2fd7f72, github.com/apache/spark/pull/4031 + + [SPARK-4014] Add TaskContext.attemptNumber and deprecate TaskContext.attemptId + Josh Rosen + 2015-01-14 11:45:40 -0800 + Commit: 259936b, github.com/apache/spark/pull/3849 + + [SPARK-5228][WebUI] Hide tables for "Active Jobs/Completed Jobs/Failed Jobs" when they are empty + Kousuke Saruta + 2015-01-14 11:10:29 -0800 + Commit: 9d4449c, github.com/apache/spark/pull/4028 + + [SPARK-2909] [MLlib] [PySpark] SparseVector in pyspark now supports indexing + MechCoder + 2015-01-14 11:03:11 -0800 + Commit: 5840f54, github.com/apache/spark/pull/4025 + + [SQL] some comments fix for GROUPING SETS + Daoyuan Wang + 2015-01-14 09:50:01 -0800 + Commit: 38bdc99, github.com/apache/spark/pull/4000 + + [SPARK-5211][SQL]Restore HiveMetastoreTypes.toDataType + Yin Huai + 2015-01-14 09:47:30 -0800 + Commit: 81f72a0, github.com/apache/spark/pull/4026 + + [SPARK-5248] [SQL] move sql.types.decimal.Decimal to sql.types.Decimal + Daoyuan Wang + 2015-01-14 09:36:59 -0800 + Commit: a3f7421, github.com/apache/spark/pull/4041 + + [SPARK-5167][SQL] Move Row into sql package and make it usable for Java. + Reynold Xin + 2015-01-14 00:38:55 -0800 + Commit: d5eeb35, github.com/apache/spark/pull/4030 + + [SPARK-5123][SQL] Reconcile Java/Scala API for data types. + Reynold Xin + 2015-01-13 17:16:41 -0800 + Commit: f996909, github.com/apache/spark/pull/3958 + + [SPARK-5168] Make SQLConf a field rather than mixin in SQLContext + Reynold Xin + 2015-01-13 13:30:35 -0800 + Commit: 14e3f11, github.com/apache/spark/pull/3965 + + [SPARK-4912][SQL] Persistent tables for the Spark SQL data sources api + Yin Huai , Michael Armbrust + 2015-01-13 13:01:27 -0800 + Commit: 6463e0b, github.com/apache/spark/pull/3960 + + [SPARK-5223] [MLlib] [PySpark] fix MapConverter and ListConverter in MLlib + Davies Liu + 2015-01-13 12:50:31 -0800 + Commit: 8ead999, github.com/apache/spark/pull/4023 + + [SPARK-5131][Streaming][DOC]: There is a discrepancy in WAL implementation and configuration doc. 
+ uncleGen + 2015-01-13 10:07:19 -0800 + Commit: 39e333e, github.com/apache/spark/pull/3930 + + [SPARK-4697][YARN]System properties should override environment variables + WangTaoTheTonic , WangTao + 2015-01-13 09:43:48 -0800 + Commit: 9dea64e, github.com/apache/spark/pull/3557 + + [SPARK-5006][Deploy]spark.port.maxRetries doesn't work + WangTaoTheTonic , WangTao + 2015-01-13 09:28:21 -0800 + Commit: f7741a9, github.com/apache/spark/pull/3841 + + [SPARK-5138][SQL] Ensure schema can be inferred from a namedtuple + Gabe Mulley + 2015-01-12 21:44:51 -0800 + Commit: 1e42e96, github.com/apache/spark/pull/3978 + + [SPARK-5049][SQL] Fix ordering of partition columns in ParquetTableScan + Michael Armbrust + 2015-01-12 15:19:09 -0800 + Commit: 5d9fa55, github.com/apache/spark/pull/3990 + + [SPARK-4999][Streaming] Change storeInBlockManager to false by default + jerryshao + 2015-01-12 13:14:44 -0800 + Commit: 3aed305, github.com/apache/spark/pull/3906 + + SPARK-5172 [BUILD] spark-examples-***.jar shades a wrong Hadoop distribution + Sean Owen + 2015-01-12 12:15:34 -0800 + Commit: aff49a3, github.com/apache/spark/pull/3992 + + [SPARK-5078] Optionally read from SPARK_LOCAL_HOSTNAME + Michael Armbrust + 2015-01-12 11:57:59 -0800 + Commit: a3978f3, github.com/apache/spark/pull/3893 + + SPARK-4159 [BUILD] Addendum: improve running of single test after enabling Java tests + Sean Owen + 2015-01-12 11:00:56 -0800 + Commit: 13e610b, github.com/apache/spark/pull/3993 + + [SPARK-5102][Core]subclass of MapStatus needs to be registered with Kryo + lianhuiwang + 2015-01-12 10:57:12 -0800 + Commit: ef9224e, github.com/apache/spark/pull/4007 + + [SPARK-5200] Disable web UI in Hive ThriftServer tests + Josh Rosen + 2015-01-12 10:47:12 -0800 + Commit: 82fd38d, github.com/apache/spark/pull/3998 + + SPARK-5018 [MLlib] [WIP] Make MultivariateGaussian public + Travis Galoppo + 2015-01-11 21:31:16 -0800 + Commit: 2130de9, github.com/apache/spark/pull/3923 + + [SPARK-4033][Examples]Input of the SparkPi too big causes the emption exception + huangzhaowei + 2015-01-11 16:32:47 -0800 + Commit: f38ef65, github.com/apache/spark/pull/2874 + + [SPARK-4951][Core] Fix the issue that a busy executor may be killed + zsxwing + 2015-01-11 16:23:28 -0800 + Commit: 6942b97, github.com/apache/spark/pull/3783 + + [SPARK-5073] spark.storage.memoryMapThreshold have two default value + lewuathe + 2015-01-11 13:50:42 -0800 + Commit: 1656aae, github.com/apache/spark/pull/3900 + + [SPARK-5032] [graphx] Remove GraphX MIMA exclude for 1.3 + Joseph K. Bradley + 2015-01-10 17:25:39 -0800 + Commit: 3313260, github.com/apache/spark/pull/3856 + + [SPARK-5029][SQL] Enable from follow multiple brackets + scwf + 2015-01-10 17:07:34 -0800 + Commit: d22a31f, github.com/apache/spark/pull/3853 + + [SPARK-4871][SQL] Show sql statement in spark ui when run sql with spark-sql + wangfei + 2015-01-10 17:04:56 -0800 + Commit: 92d9a70, github.com/apache/spark/pull/3718 + + [Minor]Resolve sbt warnings during build (MQTTStreamSuite.scala). 
+ GuoQiang Li + 2015-01-10 15:38:43 -0800 + Commit: 8a29dc7, github.com/apache/spark/pull/3989 + + [SPARK-5181] do not print writing WAL log when WAL is disabled + CodingCat + 2015-01-10 15:35:41 -0800 + Commit: f0d558b, github.com/apache/spark/pull/3985 + + [SPARK-4692] [SQL] Support ! boolean logic operator like NOT + YanTangZhai , Michael Armbrust + 2015-01-10 15:05:23 -0800 + Commit: 0ca51cc, github.com/apache/spark/pull/3555 + + [SPARK-5187][SQL] Fix caching of tables with HiveUDFs in the WHERE clause + Michael Armbrust + 2015-01-10 14:25:45 -0800 + Commit: 3684fd2, github.com/apache/spark/pull/3987 + + SPARK-4963 [SQL] Add copy to SQL's Sample operator + Yanbo Liang + 2015-01-10 14:16:37 -0800 + Commit: 77106df, github.com/apache/spark/pull/3827 + + [SPARK-4861][SQL] Refactory command in spark sql + scwf + 2015-01-10 14:08:04 -0800 + Commit: b3e86dc, github.com/apache/spark/pull/3948 + + [SPARK-4574][SQL] Adding support for defining schema in foreign DDL commands. + scwf , Yin Huai , Fei Wang , wangfei + 2015-01-10 13:53:21 -0800 + Commit: 693a323, github.com/apache/spark/pull/3431 + + [SPARK-4943][SQL] Allow table name having dot for db/catalog + Alex Liu + 2015-01-10 13:23:09 -0800 + Commit: 4b39fd1, github.com/apache/spark/pull/3941 + + [SPARK-4925][SQL] Publish Spark SQL hive-thriftserver maven artifact + Alex Liu + 2015-01-10 13:19:12 -0800 + Commit: 1e56eba, github.com/apache/spark/pull/3766 + + [SPARK-5141][SQL]CaseInsensitiveMap throws java.io.NotSerializableException + luogankun + 2015-01-09 20:38:41 -0800 + Commit: 545dfcb, github.com/apache/spark/pull/3944 + + [SPARK-4406] [MLib] FIX: Validate k in SVD + MechCoder + 2015-01-09 17:45:18 -0800 + Commit: 4554529, github.com/apache/spark/pull/3945 + + [SPARK-4990][Deploy]to find default properties file, search SPARK_CONF_DIR first + WangTaoTheTonic , WangTao + 2015-01-09 17:10:02 -0800 + Commit: 8782eb9, github.com/apache/spark/pull/3823 + + [Minor] Fix import order and other coding style + bilna , Bilna P + 2015-01-09 14:45:28 -0800 + Commit: 4e1f12d, github.com/apache/spark/pull/3966 + + [DOC] Fixed Mesos version in doc from 0.18.1 to 0.21.0 + Kousuke Saruta + 2015-01-09 14:40:45 -0800 + Commit: ae62872, github.com/apache/spark/pull/3982 + + [SPARK-4737] Task set manager properly handles serialization errors + mcheah + 2015-01-09 14:16:20 -0800 + Commit: e0f28e0, github.com/apache/spark/pull/3638 + + [SPARK-1953][YARN]yarn client mode Application Master memory size is same as driver memory... + WangTaoTheTonic + 2015-01-09 13:20:32 -0800 + Commit: e966452, github.com/apache/spark/pull/3607 + + [SPARK-5015] [mllib] Random seed for GMM + make test suite deterministic + Joseph K. Bradley + 2015-01-09 13:00:15 -0800 + Commit: 7e8e62a, github.com/apache/spark/pull/3981 + + [SPARK-3619] Upgrade to Mesos 0.21 to work around MESOS-1688 + Jongyoul Lee + 2015-01-09 10:47:08 -0800 + Commit: 454fe12, github.com/apache/spark/pull/3934 + + [SPARK-5145][Mllib] Add BLAS.dsyr and use it in GaussianMixtureEM + Liang-Chi Hsieh + 2015-01-09 10:27:33 -0800 + Commit: e9ca16e, github.com/apache/spark/pull/3949 + + [SPARK-1143] Separate pool tests into their own suite. 
+ Kay Ousterhout + 2015-01-09 09:47:06 -0800 + Commit: b6aa557, github.com/apache/spark/pull/3967 + + HOTFIX: Minor improvements to make-distribution.sh + Patrick Wendell + 2015-01-09 09:40:18 -0800 + Commit: 1790b38, github.com/apache/spark/pull/3973 + + SPARK-5136 [DOCS] Improve documentation around setting up Spark IntelliJ project + Sean Owen + 2015-01-09 09:35:46 -0800 + Commit: 547df97, github.com/apache/spark/pull/3952 + + [Minor] Fix test RetryingBlockFetcherSuite after changed config name + Aaron Davidson + 2015-01-09 09:20:16 -0800 + Commit: b4034c3, github.com/apache/spark/pull/3972 + + [SPARK-5169][YARN]fetch the correct max attempts + WangTaoTheTonic + 2015-01-09 08:10:09 -0600 + Commit: f3da4bd, github.com/apache/spark/pull/3942 + + [SPARK-5122] Remove Shark from spark-ec2 + Nicholas Chammas + 2015-01-08 17:42:08 -0800 + Commit: 167a5ab, github.com/apache/spark/pull/3939 + + [SPARK-4048] Enhance and extend hadoop-provided profile. + Marcelo Vanzin + 2015-01-08 17:15:13 -0800 + Commit: 48cecf6, github.com/apache/spark/pull/2982 + + [SPARK-4891][PySpark][MLlib] Add gamma/log normal/exp dist sampling to P... + RJ Nowling + 2015-01-08 15:03:43 -0800 + Commit: c9c8b21, github.com/apache/spark/pull/3955 + + [SPARK-4973][CORE] Local directory in the driver of client-mode continues remaining even if application finished when external shuffle is enabled + Kousuke Saruta + 2015-01-08 13:43:09 -0800 + Commit: a00af6b, github.com/apache/spark/pull/3811 + + SPARK-5148 [MLlib] Make usersOut/productsOut storagelevel in ALS configurable + Fernando Otero (ZeoS) + 2015-01-08 12:42:54 -0800 + Commit: 72df5a3, github.com/apache/spark/pull/3953 + + Document that groupByKey will OOM for large keys + Eric Moyer + 2015-01-08 11:55:23 -0800 + Commit: 538f221, github.com/apache/spark/pull/3936 + + [SPARK-5130][Deploy]Take yarn-cluster as cluster mode in spark-submit + WangTaoTheTonic + 2015-01-08 11:45:42 -0800 + Commit: 0760787, github.com/apache/spark/pull/3929 + + [Minor] Fix the value represented by spark.executor.id for consistency. + Kousuke Saruta + 2015-01-08 11:35:56 -0800 + Commit: 0a59727, github.com/apache/spark/pull/3812 + + [SPARK-4989][CORE] avoid wrong eventlog conf cause cluster down in standalone mode + Zhang, Liye + 2015-01-08 10:40:26 -0800 + Commit: 06dc4b5, github.com/apache/spark/pull/3824 + + [SPARK-4917] Add a function to convert into a graph with canonical edges in GraphOps + Takeshi Yamamuro + 2015-01-08 09:55:12 -0800 + Commit: f825e19, github.com/apache/spark/pull/3760 + + SPARK-5087. [YARN] Merge yarn.Client and yarn.ClientBase + Sandy Ryza + 2015-01-08 09:25:43 -0800 + Commit: 8d45834, github.com/apache/spark/pull/3896 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2015-01-07 23:25:56 -0800 + Commit: c082385, github.com/apache/spark/pull/3880 + + [SPARK-5116][MLlib] Add extractor for SparseVector and DenseVector + Shuo Xiang + 2015-01-07 23:22:37 -0800 + Commit: c66a976, github.com/apache/spark/pull/3919 + + [SPARK-5126][Core] Verify Spark urls before creating Actors so that invalid urls can crash the process. 
+ zsxwing + 2015-01-07 23:01:30 -0800 + Commit: 2b729d2, github.com/apache/spark/pull/3927 + + [SPARK-5132][Core]Correct stage Attempt Id key in stageInfofromJson + hushan[胡珊] + 2015-01-07 12:09:12 -0800 + Commit: d345ebe, github.com/apache/spark/pull/3932 + + [SPARK-5128][MLLib] Add common used log1pExp API in MLUtils + DB Tsai + 2015-01-07 10:13:41 -0800 + Commit: 60e2d9e, github.com/apache/spark/pull/3915 + + [SPARK-2458] Make failed application log visible on History Server + Masayoshi TSUZUKI + 2015-01-07 07:32:16 -0800 + Commit: 6e74ede, github.com/apache/spark/pull/3467 + + [SPARK-2165][YARN]add support for setting maxAppAttempts in the ApplicationSubmissionContext + WangTaoTheTonic + 2015-01-07 08:14:39 -0600 + Commit: 8fdd489, github.com/apache/spark/pull/3878 + + [YARN][SPARK-4929] Bug fix: fix the yarn-client code to support HA + huangzhaowei + 2015-01-07 08:10:42 -0600 + Commit: 5fde661, github.com/apache/spark/pull/3771 + + [SPARK-5099][Mllib] Simplify logistic loss function + Liang-Chi Hsieh + 2015-01-06 21:23:31 -0800 + Commit: e21acc1, github.com/apache/spark/pull/3899 + + [SPARK-5050][Mllib] Add unit test for sqdist + Liang-Chi Hsieh + 2015-01-06 14:00:45 -0800 + Commit: bb38ebb, github.com/apache/spark/pull/3869 + + SPARK-5017 [MLlib] - Use SVD to compute determinant and inverse of covariance matrix + Travis Galoppo + 2015-01-06 13:57:42 -0800 + Commit: 4108e5f, github.com/apache/spark/pull/3871 + + SPARK-4159 [CORE] Maven build doesn't run JUnit test suites + Sean Owen + 2015-01-06 12:02:08 -0800 + Commit: 4cba6eb, github.com/apache/spark/pull/3651 + + [Minor] Fix comments for GraphX 2D partitioning strategy + kj-ki + 2015-01-06 09:49:37 -0800 + Commit: 5e3ec11, github.com/apache/spark/pull/3904 + + [SPARK-1600] Refactor FileInputStream tests to remove Thread.sleep() calls and SystemClock usage + Josh Rosen + 2015-01-06 00:31:19 -0800 + Commit: a6394bc, github.com/apache/spark/pull/3801 + + SPARK-4843 [YARN] Squash ExecutorRunnableUtil and ExecutorRunnable + Kostas Sakellis + 2015-01-05 23:26:33 -0800 + Commit: 451546a, github.com/apache/spark/pull/3696 + + [SPARK-5040][SQL] Support expressing unresolved attributes using $"attribute name" notation in SQL DSL. + Reynold Xin + 2015-01-05 15:34:22 -0800 + Commit: 04d55d8, github.com/apache/spark/pull/3862 + + [SPARK-5093] Set spark.network.timeout to 120s consistently. + Reynold Xin + 2015-01-05 15:19:53 -0800 + Commit: bbcba3a, github.com/apache/spark/pull/3903 + + [SPARK-5089][PYSPARK][MLLIB] Fix vector convert + freeman + 2015-01-05 13:10:59 -0800 + Commit: 6c6f325, github.com/apache/spark/pull/3902 + + [SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos environme... 
+ Jongyoul Lee + 2015-01-05 12:05:09 -0800 + Commit: 1c0e7ce, github.com/apache/spark/pull/3741 + + [SPARK-5057] Log message in failed askWithReply attempts + WangTao , WangTaoTheTonic + 2015-01-05 11:59:38 -0800 + Commit: ce39b34, github.com/apache/spark/pull/3875 + + [SPARK-4688] Have a single shared network timeout in Spark + Varun Saxena , varunsaxena + 2015-01-05 10:32:37 -0800 + Commit: d3f07fd, github.com/apache/spark/pull/3562 + + [SPARK-5074][Core] Fix a non-deterministic test failure + zsxwing + 2015-01-04 21:18:33 -0800 + Commit: 5c506ce, github.com/apache/spark/pull/3889 + + [SPARK-5083][Core] Fix a flaky test in TaskResultGetterSuite + zsxwing + 2015-01-04 21:09:21 -0800 + Commit: 27e7f5a, github.com/apache/spark/pull/3894 + + [SPARK-5069][Core] Fix the race condition of TaskSchedulerImpl.dagScheduler + zsxwing + 2015-01-04 21:06:04 -0800 + Commit: 6c726a3, github.com/apache/spark/pull/3887 + + [SPARK-5067][Core] Use '===' to compare well-defined case class + zsxwing + 2015-01-04 21:03:17 -0800 + Commit: 7239652, github.com/apache/spark/pull/3886 + + [SPARK-4835] Disable validateOutputSpecs for Spark Streaming jobs + Josh Rosen + 2015-01-04 20:26:18 -0800 + Commit: 939ba1f, github.com/apache/spark/pull/3832 + + [SPARK-4631] unit test for MQTT + bilna , Bilna P + 2015-01-04 19:37:48 -0800 + Commit: e767d7d, github.com/apache/spark/pull/3844 + + [SPARK-4787] Stop SparkContext if a DAGScheduler init error occurs + Dale + 2015-01-04 13:28:37 -0800 + Commit: 3fddc94, github.com/apache/spark/pull/3809 + + [SPARK-794][Core] Remove sleep() in ClusterScheduler.stop + Brennon York + 2015-01-04 12:40:39 -0800 + Commit: b96008d, github.com/apache/spark/pull/3851 + + [SPARK-5058] Updated broken links + sigmoidanalytics + 2015-01-03 19:46:08 -0800 + Commit: 342612b, github.com/apache/spark/pull/3877 + + Fixed typos in streaming-kafka-integration.md + Akhil Das + 2015-01-02 15:12:27 -0800 + Commit: cdccc26, github.com/apache/spark/pull/3876 + + [SPARK-3325][Streaming] Add a parameter to the method print in class DStream + Yadong Qi , q00251598 , Tathagata Das , wangfei + 2015-01-02 15:09:41 -0800 + Commit: bd88b71, github.com/apache/spark/pull/3865 + + [HOTFIX] Bind web UI to ephemeral port in DriverSuite + Josh Rosen + 2015-01-01 15:03:54 -0800 + Commit: 0128398, github.com/apache/spark/pull/3873 + + [SPARK-5038] Add explicit return type for implicit functions. 
+ Reynold Xin + 2014-12-31 17:07:47 -0800 + Commit: 7749dd6, github.com/apache/spark/pull/3860 + + SPARK-2757 [BUILD] [STREAMING] Add Mima test for Spark Sink after 1.10 is released + Sean Owen + 2014-12-31 16:59:17 -0800 + Commit: 4bb1248, github.com/apache/spark/pull/3842 + + [SPARK-5035] [Streaming] ReceiverMessage trait should extend Serializable + Josh Rosen + 2014-12-31 16:02:47 -0800 + Commit: fe6efac, github.com/apache/spark/pull/3857 + + SPARK-5020 [MLlib] GaussianMixtureModel.predictMembership() should take an RDD only + Travis Galoppo + 2014-12-31 15:39:58 -0800 + Commit: c4f0b4f, github.com/apache/spark/pull/3854 + + [SPARK-5028][Streaming]Add total received and processed records metrics to Streaming UI + jerryshao + 2014-12-31 14:45:31 -0800 + Commit: fdc2aa4, github.com/apache/spark/pull/3852 + + [SPARK-4790][STREAMING] Fix ReceivedBlockTrackerSuite waits for old file... + Hari Shreedharan + 2014-12-31 14:35:07 -0800 + Commit: 3610d3c, github.com/apache/spark/pull/3726 + + [SPARK-5038][SQL] Add explicit return type for implicit functions in Spark SQL + Reynold Xin + 2014-12-31 14:25:03 -0800 + Commit: c88a3d7, github.com/apache/spark/pull/3859 + + [HOTFIX] Disable Spark UI in SparkSubmitSuite tests + Josh Rosen + 2014-12-12 12:38:37 -0800 + Commit: e24d3a9 + + SPARK-4547 [MLLIB] OOM when making bins in BinaryClassificationMetrics + Sean Owen + 2014-12-31 13:37:04 -0800 + Commit: 3d194cc, github.com/apache/spark/pull/3702 + + [SPARK-4298][Core] - The spark-submit cannot read Main-Class from Manifest. + Brennon York + 2014-12-31 11:54:10 -0800 + Commit: 8e14c5e, github.com/apache/spark/pull/3561 + + [SPARK-4797] Replace breezeSquaredDistance + Liang-Chi Hsieh + 2014-12-31 11:50:53 -0800 + Commit: 06a9aa5, github.com/apache/spark/pull/3643 + + [SPARK-1010] Clean up uses of System.setProperty in unit tests + Josh Rosen + 2014-12-30 18:12:20 -0800 + Commit: 352ed6b, github.com/apache/spark/pull/3739 + + [SPARK-4998][MLlib]delete the "train" function + Liu Jiongzhou + 2014-12-30 15:55:56 -0800 + Commit: 035bac8, github.com/apache/spark/pull/3836 + + [SPARK-4813][Streaming] Fix the issue that ContextWaiter didn't handle 'spurious wakeup' + zsxwing + 2014-12-30 14:39:13 -0800 + Commit: 6a89782, github.com/apache/spark/pull/3661 + + [Spark-4995] Replace Vector.toBreeze.activeIterator with foreachActive + Jakub Dubovsky + 2014-12-30 14:19:07 -0800 + Commit: 0f31992, github.com/apache/spark/pull/3846 + + SPARK-3955 part 2 [CORE] [HOTFIX] Different versions between jackson-mapper-asl and jackson-core-asl + Sean Owen + 2014-12-30 14:00:57 -0800 + Commit: b239ea1, github.com/apache/spark/pull/3829 + + [SPARK-4570][SQL]add BroadcastLeftSemiJoinHash + wangxiaojing + 2014-12-30 13:54:12 -0800 + Commit: 07fa191, github.com/apache/spark/pull/3442 + + [SPARK-4935][SQL] When hive.cli.print.header configured, spark-sql aborted if passed in a invalid sql + wangfei , Fei Wang + 2014-12-30 13:44:30 -0800 + Commit: 8f29b7c, github.com/apache/spark/pull/3761 + + [SPARK-4386] Improve performance when writing Parquet files + Michael Davies + 2014-12-30 13:40:51 -0800 + Commit: 7425bec, github.com/apache/spark/pull/3843 + + [SPARK-4937][SQL] Normalizes conjunctions and disjunctions to eliminate common predicates + Cheng Lian + 2014-12-30 13:38:27 -0800 + 
Commit: 61a99f6, github.com/apache/spark/pull/3784 + + [SPARK-4928][SQL] Fix: Operator '>,<,>=,<=' with decimal between different precision report error + guowei2 + 2014-12-30 12:21:00 -0800 + Commit: a75dd83, github.com/apache/spark/pull/3767 + + [SPARK-4930][SQL][DOCS]Update SQL programming guide, CACHE TABLE is eager + luogankun + 2014-12-30 12:18:55 -0800 + Commit: 2deac74, github.com/apache/spark/pull/3773 + + [SPARK-4916][SQL][DOCS]Update SQL programming guide about cache section + luogankun + 2014-12-30 12:17:49 -0800 + Commit: f7a41a0, github.com/apache/spark/pull/3759 + + [SPARK-4493][SQL] Tests for IsNull / IsNotNull in the ParquetFilterSuite + Cheng Lian + 2014-12-30 12:16:45 -0800 + Commit: 19a8802, github.com/apache/spark/pull/3748 + + [Spark-4512] [SQL] Unresolved Attribute Exception in Sort By + Cheng Hao + 2014-12-30 12:11:44 -0800 + Commit: 53f0a00, github.com/apache/spark/pull/3386 + + [SPARK-5002][SQL] Using ascending by default when not specify order in order by + wangfei + 2014-12-30 12:07:24 -0800 + Commit: daac221, github.com/apache/spark/pull/3838 + + [SPARK-4904] [SQL] Remove the unnecessary code change in Generic UDF + Cheng Hao + 2014-12-30 11:47:08 -0800 + Commit: 63b84b7, github.com/apache/spark/pull/3745 + + [SPARK-4959] [SQL] Attributes are case sensitive when using a select query from a projection + Cheng Hao + 2014-12-30 11:33:47 -0800 + Commit: 5595eaa, github.com/apache/spark/pull/3796 + + [SPARK-4975][SQL] Fix HiveInspectorSuite test failure + scwf , Fei Wang + 2014-12-30 11:30:47 -0800 + Commit: 65357f1, github.com/apache/spark/pull/3814 + + [SQL] enable view test + Daoyuan Wang + 2014-12-30 11:29:13 -0800 + Commit: 94d60b7, github.com/apache/spark/pull/3826 + + [SPARK-4908][SQL] Prevent multiple concurrent hive native commands + Michael Armbrust + 2014-12-30 11:24:46 -0800 + Commit: 480bd1d, github.com/apache/spark/pull/3834 + + [SPARK-4882] Register PythonBroadcast with Kryo so that PySpark works with KryoSerializer + Josh Rosen + 2014-12-30 09:29:52 -0800 + Commit: efa80a53, github.com/apache/spark/pull/3831 + + [SPARK-4920][UI] add version on master and worker page for standalone mode + Zhang, Liye + 2014-12-30 09:19:47 -0800 + Commit: 9077e72, github.com/apache/spark/pull/3769 + + [SPARK-4972][MLlib] Updated the scala doc for lasso and ridge regression for the change of LeastSquaresGradient + DB Tsai + 2014-12-29 17:17:12 -0800 + Commit: 040d6f2, github.com/apache/spark/pull/3808 + + Added setMinCount to Word2Vec.scala + ganonp + 2014-12-29 15:31:19 -0800 + Commit: 343db39, github.com/apache/spark/pull/3693 + + SPARK-4156 [MLLIB] EM algorithm for GMMs + Travis Galoppo , Travis Galoppo , tgaloppo , FlytxtRnD + 2014-12-29 15:29:15 -0800 + Commit: 6cf6fdf, github.com/apache/spark/pull/3022 + + SPARK-4968: takeOrdered to skip reduce step in case mappers return no partitions + Yash Datta + 2014-12-29 13:49:45 -0800 + Commit: 9bc0df6, github.com/apache/spark/pull/3830 + + [SPARK-4409][MLlib] Additional Linear Algebra Utils + Burak Yavuz , Xiangrui Meng + 2014-12-29 13:24:26 -0800 + Commit: 02b55de, github.com/apache/spark/pull/3319 + + [Minor] Fix a typo of type parameter in JavaUtils.scala + Kousuke Saruta + 2014-12-29 12:05:08 -0800 + Commit: 8d72341, 
github.com/apache/spark/pull/3789 + + [SPARK-4946] [CORE] Using AkkaUtils.askWithReply in MapOutputTracker.askTracker to reduce the chance of the communicating problem + YanTangZhai , yantangzhai + 2014-12-29 11:30:54 -0800 + Commit: 815de54, github.com/apache/spark/pull/3785 + + Adde LICENSE Header to build/mvn, build/sbt and sbt/sbt + Kousuke Saruta + 2014-12-29 10:48:53 -0800 + Commit: 4cef05e, github.com/apache/spark/pull/3817 + + [SPARK-4982][DOC] `spark.ui.retainedJobs` description is wrong in Spark UI configuration guide + wangxiaojing + 2014-12-29 10:45:14 -0800 + Commit: 6645e52, github.com/apache/spark/pull/3818 + + [SPARK-4966][YARN]The MemoryOverhead value is setted not correctly + meiyoula <1039320815@qq.com> + 2014-12-29 08:20:30 -0600 + Commit: 14fa87b, github.com/apache/spark/pull/3797 + + [SPARK-4501][Core] - Create build/mvn to automatically download maven/zinc/scalac + Brennon York + 2014-12-27 13:25:18 -0800 + Commit: a3e51cc, github.com/apache/spark/pull/3707 + + [SPARK-4952][Core]Handle ConcurrentModificationExceptions in SparkEnv.environmentDetails + GuoQiang Li + 2014-12-26 23:31:29 -0800 + Commit: 080ceb7, github.com/apache/spark/pull/3788 + + [SPARK-4954][Core] add spark version infomation in log for standalone mode + Zhang, Liye + 2014-12-26 23:23:13 -0800 + Commit: 786808a, github.com/apache/spark/pull/3790 + + [SPARK-3955] Different versions between jackson-mapper-asl and jackson-c... + Jongyoul Lee + 2014-12-26 22:59:34 -0800 + Commit: 2483c1e, github.com/apache/spark/pull/3716 + + HOTFIX: Slight tweak on previous commit. + Patrick Wendell + 2014-12-26 22:55:04 -0800 + Commit: 82bf4be + + [SPARK-3787][BUILD] Assembly jar name is wrong when we build with sbt omitting -Dhadoop.version + Kousuke Saruta + 2014-12-26 22:52:04 -0800 + Commit: de95c57, github.com/apache/spark/pull/3046 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-12-26 22:39:56 -0800 + Commit: 534f24b, github.com/apache/spark/pull/3456 + + SPARK-4971: Fix typo in BlockGenerator comment + CodingCat + 2014-12-26 12:03:22 -0800 + Commit: fda4331, github.com/apache/spark/pull/3807 + + [SPARK-4608][Streaming] Reorganize StreamingContext implicit to improve API convenience + zsxwing + 2014-12-25 19:46:05 -0800 + Commit: f9ed2b6, github.com/apache/spark/pull/3464 + + [SPARK-4537][Streaming] Expand StreamingSource to add more metrics + jerryshao + 2014-12-25 19:39:49 -0800 + Commit: f205fe4, github.com/apache/spark/pull/3466 + + [EC2] Update mesos/spark-ec2 branch to branch-1.3 + Nicholas Chammas + 2014-12-25 14:16:50 -0800 + Commit: ac82785, github.com/apache/spark/pull/3804 + + [EC2] Update default Spark version to 1.2.0 + Nicholas Chammas + 2014-12-25 14:13:12 -0800 + Commit: b6b6393, github.com/apache/spark/pull/3793 + + Fix "Building Spark With Maven" link in README.md + Denny Lee + 2014-12-25 14:05:55 -0800 + Commit: 08b18c7, github.com/apache/spark/pull/3802 + + [SPARK-4953][Doc] Fix the description of building Spark with YARN + Kousuke Saruta + 2014-12-25 07:05:43 -0800 + Commit: 11dd993, github.com/apache/spark/pull/3787 + + [SPARK-4873][Streaming] Use `Future.zip` instead of `Future.flatMap`(for-loop) in WriteAheadLogBasedBlockHandler + zsxwing + 2014-12-24 19:49:41 -0800 + Commit: b4d0db8, github.com/apache/spark/pull/3721 + + SPARK-4297 [BUILD] Build warning fixes omnibus + Sean Owen + 2014-12-24 13:32:51 -0800 + Commit: 29fabb1, github.com/apache/spark/pull/3157 + + [SPARK-4881][Minor] Use SparkConf#getBoolean instead of get().toBoolean + Kousuke Saruta + 2014-12-23 19:14:34 -0800 + Commit: 199e59a, github.com/apache/spark/pull/3733 + + [SPARK-4860][pyspark][sql] speeding up `sample()` and `takeSample()` + jbencook , J. Benjamin Cook + 2014-12-23 17:46:24 -0800 + Commit: fd41eb9, github.com/apache/spark/pull/3764 + + [SPARK-4606] Send EOF to child JVM when there's no more data to read. + Marcelo Vanzin + 2014-12-23 16:02:59 -0800 + Commit: 7e2deb7, github.com/apache/spark/pull/3460 + + [SPARK-4671][Streaming]Do not replicate streaming block when WAL is enabled + jerryshao + 2014-12-23 15:45:53 -0800 + Commit: 3f5f4cc, github.com/apache/spark/pull/3534 + + [SPARK-4802] [streaming] Remove receiverInfo once receiver is de-registered + Ilayaperumal Gopinathan + 2014-12-23 15:14:54 -0800 + Commit: 10d69e9, github.com/apache/spark/pull/3647 + + [SPARK-4913] Fix incorrect event log path + Liang-Chi Hsieh + 2014-12-23 14:58:33 -0800 + Commit: 96281cd, github.com/apache/spark/pull/3755 + + [SPARK-4730][YARN] Warn against deprecated YARN settings + Andrew Or + 2014-12-23 14:28:36 -0800 + Commit: 27c5399, github.com/apache/spark/pull/3590 + + [SPARK-4914][Build] Cleans lib_managed before compiling with Hive 0.13.1 + Cheng Lian + 2014-12-23 12:54:20 -0800 + Commit: 395b771, github.com/apache/spark/pull/3756 + + [SPARK-4932] Add help comments in Analytics + Takeshi Yamamuro + 2014-12-23 12:39:41 -0800 + Commit: 9c251c5, github.com/apache/spark/pull/3775 + + [SPARK-4834] [standalone] Clean up application files after app finishes. 
+ Marcelo Vanzin + 2014-12-23 12:02:08 -0800 + Commit: dd15536, github.com/apache/spark/pull/3705 + + [SPARK-4931][Yarn][Docs] Fix the format of running-on-yarn.md + zsxwing + 2014-12-23 11:18:06 -0800 + Commit: 2d215ae, github.com/apache/spark/pull/3774 + + [SPARK-4890] Ignore downloaded EC2 libs + Nicholas Chammas + 2014-12-23 11:12:16 -0800 + Commit: 2823c7f, github.com/apache/spark/pull/3770 + + [Docs] Minor typo fixes + Nicholas Chammas + 2014-12-22 22:54:32 -0800 + Commit: 0e532cc, github.com/apache/spark/pull/3772 + + [SPARK-4907][MLlib] Inconsistent loss and gradient in LeastSquaresGradient compared with R + DB Tsai + 2014-12-22 16:42:55 -0800 + Commit: a96b727, github.com/apache/spark/pull/3746 + + [SPARK-4818][Core] Add 'iterator' to reduce memory consumed by join + zsxwing + 2014-12-22 14:26:28 -0800 + Commit: c233ab3, github.com/apache/spark/pull/3671 + + [SPARK-4920][UI]:current spark version in UI is not striking. + genmao.ygm + 2014-12-22 14:14:39 -0800 + Commit: de9d7d2, github.com/apache/spark/pull/3763 + + [Minor] Fix scala doc + Liang-Chi Hsieh + 2014-12-22 14:13:31 -0800 + Commit: a61aa66, github.com/apache/spark/pull/3751 + + [SPARK-4864] Add documentation to Netty-based configs + Aaron Davidson + 2014-12-22 13:09:22 -0800 + Commit: fbca6b6, github.com/apache/spark/pull/3713 + + [SPARK-4079] [CORE] Consolidates Errors if a CompressionCodec is not available + Kostas Sakellis + 2014-12-22 13:07:01 -0800 + Commit: 7c0ed13, github.com/apache/spark/pull/3119 + + SPARK-4447. Remove layers of abstraction in YARN code no longer needed after dropping yarn-alpha + Sandy Ryza + 2014-12-22 12:23:43 -0800 + Commit: d62da64, github.com/apache/spark/pull/3652 + + [SPARK-4733] Add missing prameter comments in ShuffleDependency + Takeshi Yamamuro + 2014-12-22 12:19:23 -0800 + Commit: fb8e85e, github.com/apache/spark/pull/3594 + + [Minor] Improve some code in BroadcastTest for short + carlmartin + 2014-12-22 12:13:53 -0800 + Commit: 1d9788e, github.com/apache/spark/pull/3750 + + [SPARK-4883][Shuffle] Add a name to the directoryCleaner thread + zsxwing + 2014-12-22 12:11:36 -0800 + Commit: 8773705, github.com/apache/spark/pull/3734 + + [SPARK-4870] Add spark version to driver log + Zhang, Liye + 2014-12-22 11:36:49 -0800 + Commit: 39272c8, github.com/apache/spark/pull/3717 + + [SPARK-4915][YARN] Fix classname to be specified for external shuffle service. 
+ Tsuyoshi Ozawa + 2014-12-22 11:28:05 -0800 + Commit: 96606f6, github.com/apache/spark/pull/3757 + + [SPARK-4918][Core] Reuse Text in saveAsTextFile + zsxwing + 2014-12-22 11:20:00 -0800 + Commit: 93b2f3a, github.com/apache/spark/pull/3762 + + [SPARK-2075][Core] Make the compiler generate same bytes code for Hadoop 1.+ and Hadoop 2.+ + zsxwing + 2014-12-21 22:10:19 -0800 + Commit: 6ee6aa7, github.com/apache/spark/pull/3740 + + SPARK-4910 [CORE] build failed (use of FileStatus.isFile in Hadoop 1.x) + Sean Owen + 2014-12-21 13:16:57 -0800 + Commit: c6a3c0d, github.com/apache/spark/pull/3754 + + [Minor] Build Failed: value defaultProperties not found + huangzhaowei + 2014-12-19 23:32:56 -0800 + Commit: a764960, github.com/apache/spark/pull/3749 + + [SPARK-4140] Document dynamic allocation + Andrew Or , Tsuyoshi Ozawa + 2014-12-19 19:36:20 -0800 + Commit: 15c03e1, github.com/apache/spark/pull/3731 + + [SPARK-4831] Do not include SPARK_CLASSPATH if empty + Daniel Darabos + 2014-12-19 19:32:39 -0800 + Commit: 7cb3f54, github.com/apache/spark/pull/3678 + + SPARK-2641: Passing num executors to spark arguments from properties file + Kanwaljit Singh + 2014-12-19 19:25:39 -0800 + Commit: 1d64812, github.com/apache/spark/pull/1657 + + [SPARK-3060] spark-shell.cmd doesn't accept application options in Windows OS + Masayoshi TSUZUKI + 2014-12-19 19:19:53 -0800 + Commit: 8d93247, github.com/apache/spark/pull/3350 + + change signature of example to match released code + Eran Medan + 2014-12-19 18:29:36 -0800 + Commit: c25c669, github.com/apache/spark/pull/3747 + + [SPARK-2261] Make event logger use a single file. + Marcelo Vanzin + 2014-12-19 18:21:15 -0800 + Commit: 4564519, github.com/apache/spark/pull/1222 + + [SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it + Josh Rosen + 2014-12-19 17:02:37 -0800 + Commit: c28083f, github.com/apache/spark/pull/3737 + + [SPARK-4896] don’t redundantly overwrite executor JAR deps + Ryan Williams + 2014-12-19 15:24:41 -0800 + Commit: 7981f96, github.com/apache/spark/pull/2848 + + [SPARK-4889] update history server example cmds + Ryan Williams + 2014-12-19 13:56:04 -0800 + Commit: cdb2c64, github.com/apache/spark/pull/3736 + + Small refactoring to pass SparkEnv into Executor rather than creating SparkEnv in Executor. + Reynold Xin + 2014-12-19 12:51:12 -0800 + Commit: 336cd34, github.com/apache/spark/pull/3738 + + [Build] Remove spark-staging-1038 + scwf + 2014-12-19 08:29:38 -0800 + Commit: 8e253eb, github.com/apache/spark/pull/3743 + + [SPARK-4901] [SQL] Hot fix for ByteWritables.copyBytes + Cheng Hao + 2014-12-19 08:04:41 -0800 + Commit: 5479450, github.com/apache/spark/pull/3742 + + SPARK-3428. TaskMetrics for running tasks is missing GC time metrics + Sandy Ryza + 2014-12-18 22:40:44 -0800 + Commit: 283263f, github.com/apache/spark/pull/3684 + + [SPARK-4674] Refactor getCallSite + Liang-Chi Hsieh + 2014-12-18 21:41:02 -0800 + Commit: d7fc69a, github.com/apache/spark/pull/3532 + + [SPARK-4728][MLLib] Add exponential, gamma, and log normal sampling to MLlib da... 
+ RJ Nowling + 2014-12-18 21:00:49 -0800 + Commit: ee1fb97, github.com/apache/spark/pull/3680 + + [SPARK-4861][SQL] Refactory command in spark sql + wangfei , scwf + 2014-12-18 20:24:56 -0800 + Commit: c3d91da, github.com/apache/spark/pull/3712 + + [SPARK-4573] [SQL] Add SettableStructObjectInspector support in "wrap" function + Cheng Hao + 2014-12-18 20:21:52 -0800 + Commit: ae9f128, github.com/apache/spark/pull/3429 + + [SPARK-2554][SQL] Supporting SumDistinct partial aggregation + ravipesala + 2014-12-18 20:19:10 -0800 + Commit: 7687415, github.com/apache/spark/pull/3348 + + [SPARK-4693] [SQL] PruningPredicates may be wrong if predicates contains an empty AttributeSet() references + YanTangZhai , yantangzhai + 2014-12-18 20:13:46 -0800 + Commit: e7de7e5, github.com/apache/spark/pull/3556 + + [SPARK-4756][SQL] FIX: sessionToActivePool grow infinitely, even as sessions expire + guowei2 + 2014-12-18 20:10:23 -0800 + Commit: 22ddb6e, github.com/apache/spark/pull/3617 + + [SPARK-3928][SQL] Support wildcard matches on Parquet files. + Thu Kyaw + 2014-12-18 20:08:32 -0800 + Commit: b68bc6d, github.com/apache/spark/pull/3407 + + [SPARK-2663] [SQL] Support the Grouping Set + Cheng Hao + 2014-12-18 18:58:29 -0800 + Commit: f728e0f, github.com/apache/spark/pull/1567 + + [SPARK-4754] Refactor SparkContext into ExecutorAllocationClient + Andrew Or + 2014-12-18 17:37:42 -0800 + Commit: 9804a75, github.com/apache/spark/pull/3614 + + [SPARK-4837] NettyBlockTransferService should use spark.blockManager.port config + Aaron Davidson + 2014-12-18 16:43:16 -0800 + Commit: 105293a, github.com/apache/spark/pull/3688 + + SPARK-4743 - Use SparkEnv.serializer instead of closureSerializer in aggregateByKey and foldByKey + Ivan Vergiliev + 2014-12-18 16:29:36 -0800 + Commit: f9f58b9, github.com/apache/spark/pull/3605 + + [SPARK-4884]: Improve Partition docs + Madhu Siddalingaiah + 2014-12-18 16:00:53 -0800 + Commit: d5a596d, github.com/apache/spark/pull/3722 + + [SPARK-4880] remove spark.locality.wait in Analytics + Ernest + 2014-12-18 15:42:26 -0800 + Commit: a7ed6f3, github.com/apache/spark/pull/3730 + + [SPARK-4887][MLlib] Fix a bad unittest in LogisticRegressionSuite + DB Tsai + 2014-12-18 13:55:49 -0800 + Commit: 59a49db, github.com/apache/spark/pull/3735 + + [SPARK-3607] ConnectionManager threads.max configs on the thread pools don't work + Ilya Ganelin + 2014-12-18 12:53:18 -0800 + Commit: 3720057, github.com/apache/spark/pull/3664 + + Add mesos specific configurations into doc + Timothy Chen + 2014-12-18 12:15:53 -0800 + Commit: d9956f8, github.com/apache/spark/pull/3349 + + SPARK-3779. yarn spark.yarn.applicationMaster.waitTries config should be... + Sandy Ryza + 2014-12-18 12:19:07 -0600 + Commit: 253b72b, github.com/apache/spark/pull/3471 + + [SPARK-4461][YARN] pass extra java options to yarn application master + Zhan Zhang + 2014-12-18 10:01:46 -0600 + Commit: 3b76469, github.com/apache/spark/pull/3409 + + [SPARK-4822] Use sphinx tags for Python doc annotations + lewuathe + 2014-12-17 17:31:24 -0800 + Commit: 3cd5161, github.com/apache/spark/pull/3685 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-12-17 15:50:10 -0800 + Commit: ca12608, github.com/apache/spark/pull/3137 + + [SPARK-3891][SQL] Add array support to percentile, percentile_approx and constant inspectors support + Venkata Ramana G , Venkata Ramana Gollamudi + 2014-12-17 15:41:35 -0800 + Commit: f33d550, github.com/apache/spark/pull/2802 + + [SPARK-4856] [SQL] NullType instead of StringType when sampling against empty string or nul... + Cheng Hao + 2014-12-17 15:01:59 -0800 + Commit: 8d0d2a6, github.com/apache/spark/pull/3708 + + [HOTFIX][SQL] Fix parquet filter suite + Michael Armbrust + 2014-12-17 14:27:02 -0800 + Commit: 19c0faa, github.com/apache/spark/pull/3727 + + [SPARK-4821] [mllib] [python] [docs] Fix for pyspark.mllib.rand doc + Joseph K. Bradley + 2014-12-17 14:12:46 -0800 + Commit: affc3f4, github.com/apache/spark/pull/3669 + + [SPARK-3739] [SQL] Update the split num base on block size for table scanning + Cheng Hao + 2014-12-17 13:39:36 -0800 + Commit: 636d9fc, github.com/apache/spark/pull/2589 + + [SPARK-4755] [SQL] sqrt(negative value) should return null + Daoyuan Wang + 2014-12-17 12:51:27 -0800 + Commit: 902e4d5, github.com/apache/spark/pull/3616 + + [SPARK-4493][SQL] Don't pushdown Eq, NotEq, Lt, LtEq, Gt and GtEq predicates with nulls for Parquet + Cheng Lian + 2014-12-17 12:48:04 -0800 + Commit: 6277135, github.com/apache/spark/pull/3367 + + [SPARK-3698][SQL] Fix case insensitive resolution of GetField. + Michael Armbrust + 2014-12-17 12:43:51 -0800 + Commit: 7ad579e, github.com/apache/spark/pull/3724 + + [SPARK-4694]Fix HiveThriftServer2 cann't stop In Yarn HA mode. + carlmartin + 2014-12-17 12:24:03 -0800 + Commit: 4782def, github.com/apache/spark/pull/3576 + + [SPARK-4625] [SQL] Add sort by for DSL & SimpleSqlParser + Cheng Hao + 2014-12-17 12:01:57 -0800 + Commit: 5fdcbdc, github.com/apache/spark/pull/3481 + + [SPARK-4595][Core] Fix MetricsServlet not work issue + Saisai Shao , Josh Rosen , jerryshao + 2014-12-17 11:47:44 -0800 + Commit: cf50631, github.com/apache/spark/pull/3444 + + [HOTFIX] Fix RAT exclusion for known_translations file + Josh Rosen + 2014-12-16 23:00:25 -0800 + Commit: 3d0c37b, github.com/apache/spark/pull/3719 + + [Release] Update contributors list format and sort it + Andrew Or + 2014-12-16 22:11:03 -0800 + Commit: 4e1112e + + [SPARK-4618][SQL] Make foreign DDL commands options case-insensitive + scwf , wangfei + 2014-12-16 21:26:36 -0800 + Commit: 6069880, github.com/apache/spark/pull/3470 + + [SPARK-4866] support StructType as key in MapType + Davies Liu + 2014-12-16 21:23:28 -0800 + Commit: ec5c427, github.com/apache/spark/pull/3714 + + [SPARK-4375] [SQL] Add 0 argument support for udf + Cheng Hao + 2014-12-16 21:21:11 -0800 + Commit: 770d815, github.com/apache/spark/pull/3595 + + [SPARK-4720][SQL] Remainder should also return null if the divider is 0. 
+ Takuya UESHIN + 2014-12-16 21:19:57 -0800 + Commit: ddc7ba3, github.com/apache/spark/pull/3581 + + [SPARK-4744] [SQL] Short circuit evaluation for AND & OR in CodeGen + Cheng Hao + 2014-12-16 21:18:39 -0800 + Commit: 0aa834a, github.com/apache/spark/pull/3606 + + [SPARK-4798][SQL] A new set of Parquet testing API and test suites + Cheng Lian + 2014-12-16 21:16:03 -0800 + Commit: 3b395e1, github.com/apache/spark/pull/3644 + + [Release] Cache known author translations locally + Andrew Or + 2014-12-16 19:28:43 -0800 + Commit: b85044e + + [Release] Major improvements to generate contributors script + Andrew Or + 2014-12-16 17:55:27 -0800 + Commit: 6f80b74 + + [SPARK-4269][SQL] make wait time configurable in BroadcastHashJoin + Jacky Li + 2014-12-16 15:34:59 -0800 + Commit: fa66ef6, github.com/apache/spark/pull/3133 + + [SPARK-4827][SQL] Fix resolution of deeply nested Project(attr, Project(Star,...)). + Michael Armbrust + 2014-12-16 15:31:19 -0800 + Commit: a66c23e, github.com/apache/spark/pull/3674 + + [SPARK-4483][SQL]Optimization about reduce memory costs during the HashOuterJoin + tianyi , tianyi + 2014-12-16 15:22:29 -0800 + Commit: 30f6b85, github.com/apache/spark/pull/3375 + + [SPARK-4527][SQl]Add BroadcastNestedLoopJoin operator selection testsuite + wangxiaojing + 2014-12-16 14:45:56 -0800 + Commit: ea1315e, github.com/apache/spark/pull/3395 + + SPARK-4767: Add support for launching in a specified placement group to spark_ec2 + Holden Karau + 2014-12-16 14:37:04 -0800 + Commit: b0dfdbd, github.com/apache/spark/pull/3623 + + [SPARK-4812][SQL] Fix the initialization issue of 'codegenEnabled' + zsxwing + 2014-12-16 14:13:40 -0800 + Commit: 6530243, github.com/apache/spark/pull/3660 + + [SPARK-4847][SQL]Fix "extraStrategies cannot take effect in SQLContext" issue + jerryshao + 2014-12-16 14:08:28 -0800 + Commit: dc8280d, github.com/apache/spark/pull/3698 + + [DOCS][SQL] Add a Note on jsonFile having separate JSON objects per line + Peter Vandenabeele + 2014-12-16 13:57:55 -0800 + Commit: 1a9e35e, github.com/apache/spark/pull/3517 + + [SQL] SPARK-4700: Add HTTP protocol spark thrift server + Judy Nash , judynash + 2014-12-16 12:37:26 -0800 + Commit: 17688d1, github.com/apache/spark/pull/3672 + + [SPARK-3405] add subnet-id and vpc-id options to spark_ec2.py + Mike Jennings , Mike Jennings + 2014-12-16 12:13:21 -0800 + Commit: d12c071, github.com/apache/spark/pull/2872 + + [SPARK-4855][mllib] testing the Chi-squared hypothesis test + jbencook + 2014-12-16 11:37:23 -0800 + Commit: cb48447, github.com/apache/spark/pull/3679 + + [SPARK-4437] update doc for WholeCombineFileRecordReader + Davies Liu , Josh Rosen + 2014-12-16 11:19:36 -0800 + Commit: ed36200, github.com/apache/spark/pull/3301 + + [SPARK-4841] fix zip with textFile() + Davies Liu + 2014-12-15 22:58:26 -0800 + Commit: c246b95, github.com/apache/spark/pull/3706 + + [SPARK-4792] Add error message when making local dir unsuccessfully + meiyoula <1039320815@qq.com> + 2014-12-15 22:30:18 -0800 + Commit: c762877, github.com/apache/spark/pull/3635 + + SPARK-4814 [CORE] Enable assertions in SBT, Maven tests / AssertionError from Hive's LazyBinaryInteger + Sean Owen + 2014-12-15 17:12:05 -0800 + Commit: 81112e4, github.com/apache/spark/pull/3692 + + [Minor][Core] fix comments in 
MapOutputTracker + wangfei + 2014-12-15 16:46:21 -0800 + Commit: 5c24759, github.com/apache/spark/pull/3700 + + SPARK-785 [CORE] ClosureCleaner not invoked on most PairRDDFunctions + Sean Owen + 2014-12-15 16:06:15 -0800 + Commit: 2a28bc6, github.com/apache/spark/pull/3690 + + [SPARK-4668] Fix some documentation typos. + Ryan Williams + 2014-12-15 14:52:17 -0800 + Commit: 8176b7a, github.com/apache/spark/pull/3523 + + [SPARK-1037] The name of findTaskFromList & findTask in TaskSetManager.scala is confusing + Ilya Ganelin + 2014-12-15 14:51:15 -0800 + Commit: 38703bb, github.com/apache/spark/pull/3665 + + [SPARK-4826] Fix generation of temp file names in WAL tests + Josh Rosen + 2014-12-15 14:33:43 -0800 + Commit: f6b8591, github.com/apache/spark/pull/3695. + + [SPARK-4494][mllib] IDFModel.transform() add support for single vector + Yuu ISHIKAWA + 2014-12-15 13:44:15 -0800 + Commit: 8098fab, github.com/apache/spark/pull/3603 + + HOTFIX: Disabling failing block manager test + Patrick Wendell + 2014-12-15 10:54:45 -0800 + Commit: 4c06738 + + fixed spelling errors in documentation + Peter Klipfel + 2014-12-14 00:01:16 -0800 + Commit: 2a2983f, github.com/apache/spark/pull/3691 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-12-11 23:38:40 -0800 + Commit: ef84dab, github.com/apache/spark/pull/3488 + + [SPARK-4829] [SQL] add rule to fold count(expr) if expr is not null + Daoyuan Wang + 2014-12-11 22:56:42 -0800 + Commit: 41a3f93, github.com/apache/spark/pull/3676 + + [SPARK-4742][SQL] The name of Parquet File generated by AppendingParquetOutputFormat should be zero padded + Sasaki Toru + 2014-12-11 22:54:21 -0800 + Commit: 8091dd6, github.com/apache/spark/pull/3602 + + [SPARK-4825] [SQL] CTAS fails to resolve when created using saveAsTable + Cheng Hao + 2014-12-11 22:51:49 -0800 + Commit: 0abbff2, github.com/apache/spark/pull/3673 + + [SQL] enable empty aggr test case + Daoyuan Wang + 2014-12-11 22:50:18 -0800 + Commit: cbb634a, github.com/apache/spark/pull/3445 + + [SPARK-4828] [SQL] sum and avg on empty table should always return null + Daoyuan Wang + 2014-12-11 22:49:27 -0800 + Commit: acb3be6, github.com/apache/spark/pull/3675 + + [SQL] Remove unnecessary case in HiveContext.toHiveString + scwf + 2014-12-11 22:48:03 -0800 + Commit: d8cf678, github.com/apache/spark/pull/3563 + + [SPARK-4293][SQL] Make Cast be able to handle complex types. + Takuya UESHIN + 2014-12-11 22:45:25 -0800 + Commit: 3344803, github.com/apache/spark/pull/3150 + + [SPARK-4639] [SQL] Pass maxIterations in as a parameter in Analyzer + Jacky Li + 2014-12-11 22:44:27 -0800 + Commit: c152dde, github.com/apache/spark/pull/3499 + + [SPARK-4662] [SQL] Whitelist more unittest + Cheng Hao + 2014-12-11 22:43:02 -0800 + Commit: a7f07f5, github.com/apache/spark/pull/3522 + + [SPARK-4713] [SQL] SchemaRDD.unpersist() should not raise exception if it is not persisted + Cheng Hao + 2014-12-11 22:41:36 -0800 + Commit: bf40cf8, github.com/apache/spark/pull/3572 + + [SPARK-4806] Streaming doc update for 1.2 + Tathagata Das , Josh Rosen , Josh Rosen + 2014-12-11 06:21:23 -0800 + Commit: b004150, github.com/apache/spark/pull/3653 + + [SPARK-4791] [sql] Infer schema from case class with multiple constructors + Joseph K. 
Bradley + 2014-12-10 23:41:15 -0800 + Commit: 2a5b5fd, github.com/apache/spark/pull/3646 + + [CORE]codeStyle: uniform ConcurrentHashMap define in StorageLevel.scala with other places + Zhang, Liye + 2014-12-10 20:44:59 -0800 + Commit: 57d37f9, github.com/apache/spark/pull/2793 + + SPARK-3526 Add section about data locality to the tuning guide + Andrew Ash + 2014-12-10 15:01:15 -0800 + Commit: 652b781, github.com/apache/spark/pull/2519 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-12-10 14:41:16 -0800 + Commit: 36bdb5b, github.com/apache/spark/pull/2883 + + [SPARK-4759] Fix driver hanging from coalescing partitions + Andrew Or + 2014-12-10 14:27:53 -0800 + Commit: 4f93d0c, github.com/apache/spark/pull/3633 + + [SPARK-4569] Rename 'externalSorting' in Aggregator + Ilya Ganelin + 2014-12-10 14:19:37 -0800 + Commit: 447ae2d, github.com/apache/spark/pull/3666 + + [SPARK-4793] [Deploy] ensure .jar at end of line + Daoyuan Wang + 2014-12-10 13:29:27 -0800 + Commit: e230da1, github.com/apache/spark/pull/3641 + + [SPARK-4215] Allow requesting / killing executors only in YARN mode + Andrew Or + 2014-12-10 12:48:24 -0800 + Commit: faa8fd8, github.com/apache/spark/pull/3615 + + [SPARK-4771][Docs] Document standalone cluster supervise mode + Andrew Or + 2014-12-10 12:41:36 -0800 + Commit: 5621283, github.com/apache/spark/pull/3627 + + [SPARK-4329][WebUI] HistoryPage pagenation + Kousuke Saruta + 2014-12-10 12:29:00 -0800 + Commit: 0fc637b, github.com/apache/spark/pull/3194 + + [SPARK-4161]Spark shell class path is not correctly set if "spark.driver.extraClassPath" is set in defaults.conf + GuoQiang Li + 2014-12-10 12:24:04 -0800 + Commit: 742e709, github.com/apache/spark/pull/3050 + + [SPARK-4772] Clear local copies of accumulators as soon as we're done with them + Nathan Kronenfeld + 2014-12-09 23:53:17 -0800 + Commit: 94b377f, github.com/apache/spark/pull/3570 + + [Minor] Use tag for help icon in web UI page header + Josh Rosen + 2014-12-09 23:47:05 -0800 + Commit: f79c1cf, github.com/apache/spark/pull/3659 + + Config updates for the new shuffle transport. + Reynold Xin + 2014-12-09 19:29:09 -0800 + Commit: 9bd9334, github.com/apache/spark/pull/3657 + + [SPARK-4740] Create multiple concurrent connections between two peer nodes in Netty. + Reynold Xin + 2014-12-09 17:49:59 -0800 + Commit: 2b9b726, github.com/apache/spark/pull/3625 + + SPARK-4805 [CORE] BlockTransferMessage.toByteArray() trips assertion + Sean Owen + 2014-12-09 16:38:27 -0800 + Commit: d8f84f2, github.com/apache/spark/pull/3650 + + SPARK-4567. Make SparkJobInfo and SparkStageInfo serializable + Sandy Ryza + 2014-12-09 16:26:07 -0800 + Commit: 5e4c06f, github.com/apache/spark/pull/3426 + + [SPARK-4714] BlockManager.dropFromMemory() should check whether block has been removed after synchronizing on BlockInfo instance. + hushan[胡珊] + 2014-12-09 15:11:20 -0800 + Commit: 30dca92, github.com/apache/spark/pull/3574 + + [SPARK-4765] Make GC time always shown in UI. 
+ Kay Ousterhout + 2014-12-09 15:10:36 -0800 + Commit: 1f51106, github.com/apache/spark/pull/3622 + + [SPARK-4691][shuffle] Restructure a few lines in shuffle code + maji2014 + 2014-12-09 13:13:12 -0800 + Commit: b310744, github.com/apache/spark/pull/3553 + + [SPARK-874] adding a --wait flag + jbencook + 2014-12-09 12:16:19 -0800 + Commit: 61f1a70, github.com/apache/spark/pull/3567 + + SPARK-4338. [YARN] Ditch yarn-alpha. + Sandy Ryza + 2014-12-09 11:02:43 -0800 + Commit: 912563a, github.com/apache/spark/pull/3215 + + [SPARK-4785][SQL] Initilize Hive UDFs on the driver and serialize them with a wrapper + Cheng Hao , Cheng Lian + 2014-12-09 10:28:15 -0800 + Commit: 383c555, github.com/apache/spark/pull/3640 + + [SPARK-3154][STREAMING] Replace ConcurrentHashMap with mutable.HashMap and remove @volatile from 'stopped' + zsxwing + 2014-12-08 23:54:15 -0800 + Commit: bcb5cda, github.com/apache/spark/pull/3634 + + [SPARK-4769] [SQL] CTAS does not work when reading from temporary tables + Cheng Hao + 2014-12-08 17:39:12 -0800 + Commit: 51b1fe1, github.com/apache/spark/pull/3336 + + [SQL] remove unnecessary import in spark-sql + Jacky Li + 2014-12-08 17:27:46 -0800 + Commit: 9443843, github.com/apache/spark/pull/3630 + + SPARK-4770. [DOC] [YARN] spark.scheduler.minRegisteredResourcesRatio doc... + Sandy Ryza + 2014-12-08 16:28:36 -0800 + Commit: cda94d1, github.com/apache/spark/pull/3624 + + SPARK-3926 [CORE] Reopened: result of JavaRDD collectAsMap() is not serializable + Sean Owen + 2014-12-08 16:13:03 -0800 + Commit: e829bfa, github.com/apache/spark/pull/3587 + + [SPARK-4750] Dynamic allocation - synchronize kills + Andrew Or + 2014-12-08 16:02:33 -0800 + Commit: 65f929d, github.com/apache/spark/pull/3612 + + [SPARK-4774] [SQL] Makes HiveFromSpark more portable + Kostas Sakellis + 2014-12-08 15:44:18 -0800 + Commit: d6a972b, github.com/apache/spark/pull/3628 + + [SPARK-4764] Ensure that files are fetched atomically + Christophe Préaud + 2014-12-08 11:44:54 -0800 + Commit: ab2abcb, github.com/apache/spark/pull/2855 + + [SPARK-4620] Add unpersist in Graph and GraphImpl + Takeshi Yamamuro + 2014-12-07 19:42:02 -0800 + Commit: 8817fc7, github.com/apache/spark/pull/3476 + + [SPARK-4646] Replace Scala.util.Sorting.quickSort with Sorter(TimSort) in Spark + Takeshi Yamamuro + 2014-12-07 19:36:08 -0800 + Commit: 2e6b736, github.com/apache/spark/pull/3507 + + [SPARK-3623][GraphX] GraphX should support the checkpoint operation + GuoQiang Li + 2014-12-06 00:56:51 -0800 + Commit: e895e0c, github.com/apache/spark/pull/2631 + + Streaming doc : do you mean inadvertently? + CrazyJvm + 2014-12-05 13:42:13 -0800 + Commit: 6eb1b6f, github.com/apache/spark/pull/3620 + + [SPARK-4005][CORE] handle message replies in receive instead of in the individual private methods + Zhang, Liye + 2014-12-05 12:00:32 -0800 + Commit: 98a7d09, github.com/apache/spark/pull/2853 + + [SPARK-4761][SQL] Enables Kryo by default in Spark SQL Thrift server + Cheng Lian + 2014-12-05 10:27:40 -0800 + Commit: 6f61e1f, github.com/apache/spark/pull/3621 + + [SPARK-4753][SQL] Use catalyst for partition pruning in newParquet. 
+ Michael Armbrust + 2014-12-04 22:25:21 -0800 + Commit: f5801e8, github.com/apache/spark/pull/3613 + + Revert "SPARK-2624 add datanucleus jars to the container in yarn-cluster" + Andrew Or + 2014-12-04 21:53:49 -0800 + Commit: fd85253 + + Revert "[HOT FIX] [YARN] Check whether `/lib` exists before listing its files" + Andrew Or + 2014-12-04 21:53:38 -0800 + Commit: 87437df + + [SPARK-4464] Description about configuration options need to be modified in docs. + Masayoshi TSUZUKI + 2014-12-04 19:33:02 -0800 + Commit: ca37903, github.com/apache/spark/pull/3329 + + Fix typo in Spark SQL docs. + Andy Konwinski + 2014-12-04 18:27:02 -0800 + Commit: 15cf3b0, github.com/apache/spark/pull/3611 + + [SPARK-4421] Wrong link in spark-standalone.html + Masayoshi TSUZUKI + 2014-12-04 18:14:36 -0800 + Commit: ddfc09c, github.com/apache/spark/pull/3279 + + [SPARK-4397] Move object RDD to the front of RDD.scala. + Reynold Xin + 2014-12-04 16:32:20 -0800 + Commit: ed92b47, github.com/apache/spark/pull/3580 + + [SPARK-4652][DOCS] Add docs about spark-git-repo option + lewuathe , Josh Rosen + 2014-12-04 15:14:36 -0800 + Commit: ab8177d, github.com/apache/spark/pull/3513 + + [SPARK-4459] Change groupBy type parameter from K to U + Saldanha + 2014-12-04 14:22:09 -0800 + Commit: 743a889, github.com/apache/spark/pull/3327 + + [SPARK-4745] Fix get_existing_cluster() function with multiple security groups + alexdebrie + 2014-12-04 14:13:59 -0800 + Commit: 794f3ae, github.com/apache/spark/pull/3596 + + [HOTFIX] Fixing two issues with the release script. + Patrick Wendell + 2014-12-04 12:11:41 -0800 + Commit: 8dae26f, github.com/apache/spark/pull/3608 + + [SPARK-4253] Ignore spark.driver.host in yarn-cluster and standalone-cluster modes + WangTaoTheTonic , WangTao + 2014-12-04 11:52:47 -0800 + Commit: 8106b1e, github.com/apache/spark/pull/3112 + + [SPARK-4683][SQL] Add a beeline.cmd to run on Windows + Cheng Lian + 2014-12-04 10:21:03 -0800 + Commit: 28c7aca, github.com/apache/spark/pull/3599 + + [FIX][DOC] Fix broken links in ml-guide.md + Xiangrui Meng + 2014-12-04 20:16:35 +0800 + Commit: 7e758d7, github.com/apache/spark/pull/3601 + + [SPARK-4575] [mllib] [docs] spark.ml pipelines doc + bug fixes + Joseph K. Bradley , jkbradley , Xiangrui Meng + 2014-12-04 17:00:06 +0800 + Commit: 469a6e5, github.com/apache/spark/pull/3588 + + [docs] Fix outdated comment in tuning guide + Joseph K. Bradley + 2014-12-04 00:59:32 -0800 + Commit: 529439b, github.com/apache/spark/pull/3592 + + [SQL] Minor: Avoid calling Seq#size in a loop + Aaron Davidson + 2014-12-04 00:58:42 -0800 + Commit: c6c7165, github.com/apache/spark/pull/3593 + + [SPARK-4685] Include all spark.ml and spark.mllib packages in JavaDoc's MLlib group + lewuathe , Xiangrui Meng + 2014-12-04 16:51:41 +0800 + Commit: 20bfea4, github.com/apache/spark/pull/3554 + + [SPARK-4719][API] Consolidate various narrow dep RDD classes with MapPartitionsRDD + Reynold Xin + 2014-12-04 00:45:57 -0800 + Commit: c3ad486, github.com/apache/spark/pull/3578 + + [SQL] remove unnecessary import + Jacky Li + 2014-12-04 00:43:55 -0800 + Commit: ed88db4, github.com/apache/spark/pull/3585 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-12-03 22:15:46 -0800 + Commit: 3cdae03, github.com/apache/spark/pull/1875 + + [Release] Correctly translate contributors name in release notes + Andrew Or + 2014-12-03 19:08:29 -0800 + Commit: a4dfb4e + + [SPARK-4580] [SPARK-4610] [mllib] [docs] Documentation for tree ensembles + DecisionTree API fix + Joseph K. Bradley , Joseph K. Bradley + 2014-12-04 09:57:50 +0800 + Commit: 657a888, github.com/apache/spark/pull/3461 + + [SPARK-4711] [mllib] [docs] Programming guide advice on choosing optimizer + Joseph K. Bradley + 2014-12-04 08:58:03 +0800 + Commit: 27ab0b8, github.com/apache/spark/pull/3569 + + [SPARK-4085] Propagate FetchFailedException when Spark fails to read local shuffle file. + Reynold Xin + 2014-12-03 16:28:24 -0800 + Commit: 1826372, github.com/apache/spark/pull/3579 + + [SPARK-4498][core] Don't transition ExecutorInfo to RUNNING until Driver adds Executor + Mark Hamstra + 2014-12-03 15:08:01 -0800 + Commit: 96b2785, github.com/apache/spark/pull/3550 + + [SPARK-4552][SQL] Avoid exception when reading empty parquet data through Hive + Michael Armbrust + 2014-12-03 14:13:35 -0800 + Commit: 513ef82, github.com/apache/spark/pull/3586 + + [HOT FIX] [YARN] Check whether `/lib` exists before listing its files + Andrew Or + 2014-12-03 13:56:23 -0800 + Commit: 90ec643, github.com/apache/spark/pull/3589 + + [SPARK-4642] Add description about spark.yarn.queue to running-on-YARN document. + Masayoshi TSUZUKI + 2014-12-03 13:16:24 -0800 + Commit: 692f493, github.com/apache/spark/pull/3500 + + [SPARK-4715][Core] Make sure tryToAcquire won't return a negative value + zsxwing + 2014-12-03 12:19:40 -0800 + Commit: edd3cd4, github.com/apache/spark/pull/3575 + + [SPARK-4701] Typo in sbt/sbt + Masayoshi TSUZUKI + 2014-12-03 12:08:00 -0800 + Commit: 96786e3, github.com/apache/spark/pull/3560 + + SPARK-2624 add datanucleus jars to the container in yarn-cluster + Jim Lim + 2014-12-03 11:16:02 -0800 + Commit: a975dc3, github.com/apache/spark/pull/3238 + + [SPARK-4717][MLlib] Optimize BLAS library to avoid de-reference multiple times in loop + DB Tsai + 2014-12-03 22:31:39 +0800 + Commit: d005429, github.com/apache/spark/pull/3577 + + [SPARK-4708][MLLib] Make k-mean runs two/three times faster with dense/sparse sample + DB Tsai + 2014-12-03 19:01:56 +0800 + Commit: 7fc49ed, github.com/apache/spark/pull/3565 + + [SPARK-4710] [mllib] Eliminate MLlib compilation warnings + Joseph K. 
Bradley + 2014-12-03 18:50:03 +0800 + Commit: 4ac2151, github.com/apache/spark/pull/3568 + + [SPARK-4397][Core] Change the 'since' value of '@deprecated' to '1.3.0' + zsxwing + 2014-12-03 02:05:17 -0800 + Commit: 8af551f, github.com/apache/spark/pull/3573 + + [SPARK-4672][Core]Checkpoint() should clear f to shorten the serialization chain + JerryLead , Lijie Xu + 2014-12-02 23:53:29 -0800 + Commit: 77be8b9, github.com/apache/spark/pull/3545 + + [SPARK-4672][GraphX]Non-transient PartitionsRDDs will lead to StackOverflow error + JerryLead , Lijie Xu + 2014-12-02 17:14:11 -0800 + Commit: 17c162f, github.com/apache/spark/pull/3544 + + [SPARK-4672][GraphX]Perform checkpoint() on PartitionsRDD to shorten the lineage + JerryLead , Lijie Xu + 2014-12-02 17:08:02 -0800 + Commit: fc0a147, github.com/apache/spark/pull/3549 + + [Release] Translate unknown author names automatically + Andrew Or + 2014-12-02 16:36:12 -0800 + Commit: 5da21f0 + + Minor nit style cleanup in GraphX. + Reynold Xin + 2014-12-02 14:40:26 -0800 + Commit: 2d4f6e7 + + [SPARK-4695][SQL] Get result using executeCollect + wangfei + 2014-12-02 14:30:44 -0800 + Commit: 3ae0cda, github.com/apache/spark/pull/3547 + + [SPARK-4670] [SQL] wrong symbol for bitwise not + Daoyuan Wang + 2014-12-02 14:25:12 -0800 + Commit: 1f5ddf1, github.com/apache/spark/pull/3528 + + [SPARK-4593][SQL] Return null when denominator is 0 + Daoyuan Wang + 2014-12-02 14:21:12 -0800 + Commit: f6df609, github.com/apache/spark/pull/3443 + + [SPARK-4676][SQL] JavaSchemaRDD.schema may throw NullType MatchError if sql has null + YanTangZhai , yantangzhai , Michael Armbrust + 2014-12-02 14:12:48 -0800 + Commit: 1066427, github.com/apache/spark/pull/3538 + + [SPARK-4663][sql]add finally to avoid resource leak + baishuo + 2014-12-02 12:12:03 -0800 + Commit: 69b6fed, github.com/apache/spark/pull/3526 + + [SPARK-4536][SQL] Add sqrt and abs to Spark SQL DSL + Kousuke Saruta + 2014-12-02 12:07:52 -0800 + Commit: e75e04f, github.com/apache/spark/pull/3401 + + Indent license header properly for interfaces.scala. + Reynold Xin + 2014-12-02 11:59:15 -0800 + Commit: b1f8fe3, github.com/apache/spark/pull/3552 + + [SPARK-4686] Link to allowed master URLs is broken + Kay Ousterhout + 2014-12-02 09:06:02 -0800 + Commit: d9a148b, github.com/apache/spark/pull/3542 + + [SPARK-4397][Core] Cleanup 'import SparkContext._' in core + zsxwing + 2014-12-02 00:18:41 -0800 + Commit: 6dfe38a, github.com/apache/spark/pull/3530 + + [SPARK-4611][MLlib] Implement the efficient vector norm + DB Tsai + 2014-12-02 11:40:43 +0800 + Commit: 64f3175, github.com/apache/spark/pull/3462 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-12-01 17:27:14 -0800 + Commit: b0a46d8, github.com/apache/spark/pull/1612 + + [SPARK-4268][SQL] Use #::: to get benefit from Stream in SqlLexical.allCaseVersions + zsxwing + 2014-12-01 16:39:54 -0800 + Commit: d3e02dd, github.com/apache/spark/pull/3132 + + [SPARK-4529] [SQL] support view with column alias + Daoyuan Wang + 2014-12-01 16:08:51 -0800 + Commit: 4df60a8, github.com/apache/spark/pull/3396 + + [SQL][DOC] Date type in SQL programming guide + Daoyuan Wang + 2014-12-01 14:03:57 -0800 + Commit: 5edbcbf, github.com/apache/spark/pull/3535 + + [SQL] Minor fix for doc and comment + wangfei + 2014-12-01 14:02:02 -0800 + Commit: 7b79957, github.com/apache/spark/pull/3533 + + [SPARK-4658][SQL] Code documentation issue in DDL of datasource API + ravipesala + 2014-12-01 13:31:27 -0800 + Commit: bc35381, github.com/apache/spark/pull/3516 + + [SPARK-4650][SQL] Supporting multi column support in countDistinct function like count(distinct c1,c2..) in Spark SQL + ravipesala , Michael Armbrust + 2014-12-01 13:26:44 -0800 + Commit: 6a9ff19, github.com/apache/spark/pull/3511 + + [SPARK-4358][SQL] Let BigDecimal do checking type compatibility + Liang-Chi Hsieh + 2014-12-01 13:17:56 -0800 + Commit: b57365a, github.com/apache/spark/pull/3208 + + [SQL] add @group tab in limit() and count() + Jacky Li + 2014-12-01 13:12:30 -0800 + Commit: bafee67, github.com/apache/spark/pull/3458 + + [SPARK-4258][SQL][DOC] Documents spark.sql.parquet.filterPushdown + Cheng Lian + 2014-12-01 13:09:51 -0800 + Commit: 5db8dca, github.com/apache/spark/pull/3440 + + Documentation: add description for repartitionAndSortWithinPartitions + Madhu Siddalingaiah + 2014-12-01 08:45:34 -0800 + Commit: 2b233f5, github.com/apache/spark/pull/3390 + + [SPARK-4661][Core] Minor code and docs cleanup + zsxwing + 2014-12-01 00:35:01 -0800 + Commit: 30a86ac, github.com/apache/spark/pull/3521 + + [SPARK-4664][Core] Throw an exception when spark.akka.frameSize > 2047 + zsxwing + 2014-12-01 00:32:54 -0800 + Commit: 1d238f2, github.com/apache/spark/pull/3527 + + SPARK-2192 [BUILD] Examples Data Not in Binary Distribution + Sean Owen + 2014-12-01 16:31:04 +0800 + Commit: 6384f42, github.com/apache/spark/pull/3480 + + Fix wrong file name pattern in .gitignore + Kousuke Saruta + 2014-12-01 00:29:28 -0800 + Commit: 97eb6d7, github.com/apache/spark/pull/3529 + + [SPARK-4632] version update + Prabeesh K + 2014-11-30 20:51:53 -0800 + Commit: 5e7a6dc, github.com/apache/spark/pull/3495 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-11-30 20:51:13 -0800 + Commit: 06dc1b1, github.com/apache/spark/pull/2915 + + [DOC] Fixes formatting typo in SQL programming guide + Cheng Lian + 2014-11-30 19:04:07 -0800 + Commit: 2a4d389, github.com/apache/spark/pull/3498 + + [SPARK-4656][Doc] Typo in Programming Guide markdown + lewuathe + 2014-11-30 17:18:50 -0800 + Commit: a217ec5, github.com/apache/spark/pull/3412 + + [SPARK-4623]Add the some error infomation if using spark-sql in yarn-cluster mode + carlmartin , huangzhaowei + 2014-11-30 16:19:41 -0800 + Commit: aea7a99, github.com/apache/spark/pull/3479 + + SPARK-2143 [WEB UI] Add Spark version to UI footer + Sean Owen + 2014-11-30 11:40:08 -0800 + Commit: 048ecca, github.com/apache/spark/pull/3410 + + [DOCS][BUILD] Add instruction to use change-version-to-2.11.sh in 'Building for Scala 2.11'. + Takuya UESHIN + 2014-11-30 00:10:31 -0500 + Commit: 0fcd24c, github.com/apache/spark/pull/3361 + + SPARK-4507: PR merge script should support closing multiple JIRA tickets + Takayuki Hasegawa + 2014-11-29 23:12:10 -0500 + Commit: 4316a7b, github.com/apache/spark/pull/3428 + + [SPARK-4505][Core] Add a ClassTag parameter to CompactBuffer[T] + zsxwing + 2014-11-29 20:23:08 -0500 + Commit: c062224, github.com/apache/spark/pull/3378 + + [SPARK-4057] Use -agentlib instead of -Xdebug in sbt-launch-lib.bash for debugging + Kousuke Saruta + 2014-11-29 20:14:14 -0500 + Commit: 938dc14, github.com/apache/spark/pull/2904 + + Include the key name when failing on an invalid value. + Stephen Haberman + 2014-11-29 20:12:05 -0500 + Commit: 95290bf, github.com/apache/spark/pull/3514 + + [SPARK-3398] [SPARK-4325] [EC2] Use EC2 status checks. + Nicholas Chammas + 2014-11-29 00:31:06 -0800 + Commit: 317e114, github.com/apache/spark/pull/3195 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-11-29 00:24:35 -0500 + Commit: 047ff57, github.com/apache/spark/pull/3451 + + [SPARK-4597] Use proper exception and reset variable in Utils.createTempDir() + Liang-Chi Hsieh + 2014-11-28 18:04:05 -0800 + Commit: 49fe879, github.com/apache/spark/pull/3449 + + SPARK-1450 [EC2] Specify the default zone in the EC2 script help + Sean Owen + 2014-11-28 17:43:38 -0500 + Commit: 48223d8, github.com/apache/spark/pull/3454 + + [SPARK-4584] [yarn] Remove security manager from Yarn AM. + Marcelo Vanzin + 2014-11-28 15:15:30 -0500 + Commit: 915f8ee, github.com/apache/spark/pull/3484 + + [SPARK-4193][BUILD] Disable doclint in Java 8 to prevent from build error. 
+ Takuya UESHIN + 2014-11-28 13:00:15 -0500 + Commit: e464f0a, github.com/apache/spark/pull/3058 + + [SPARK-4643] [Build] Remove unneeded staging repositories from build + Daoyuan Wang + 2014-11-28 12:41:38 -0500 + Commit: 53ed7f1, github.com/apache/spark/pull/3504 + + Delete unnecessary function + KaiXinXiaoLei + 2014-11-28 12:34:07 -0500 + Commit: 052e658, github.com/apache/spark/pull/3224 + + [SPARK-4645][SQL] Disables asynchronous execution in Hive 0.13.1 HiveThriftServer2 + Cheng Lian + 2014-11-28 11:42:40 -0500 + Commit: 5b99bf2, github.com/apache/spark/pull/3506 + + [SPARK-4619][Storage]delete redundant time suffix + maji2014 + 2014-11-28 00:36:22 -0800 + Commit: ceb6281, github.com/apache/spark/pull/3475 + + [SPARK-4613][Core] Java API for JdbcRDD + Cheng Lian + 2014-11-27 18:01:14 -0800 + Commit: 120a350, github.com/apache/spark/pull/3478 + + [SPARK-4626] Kill a task only if the executorId is (still) registered with the scheduler + roxchkplusony + 2014-11-27 15:54:40 -0800 + Commit: 84376d3, github.com/apache/spark/pull/3483 + + SPARK-4170 [CORE] Closure problems when running Scala app that "extends App" + Sean Owen + 2014-11-27 09:03:17 -0800 + Commit: 5d7fe17, github.com/apache/spark/pull/3497 + + [Release] Automate generation of contributors list + Andrew Or + 2014-11-26 23:16:23 -0800 + Commit: c86e9bc + + [SPARK-732][SPARK-3628][CORE][RESUBMIT] eliminate duplicate update on accmulator + CodingCat + 2014-11-26 16:52:04 -0800 + Commit: 5af53ad, github.com/apache/spark/pull/2524 + + [SPARK-4614][MLLIB] Slight API changes in Matrix and Matrices + Xiangrui Meng + 2014-11-26 08:22:50 -0800 + Commit: 561d31d, github.com/apache/spark/pull/3468 + + Removing confusing TripletFields + Joseph E. Gonzalez + 2014-11-26 00:55:28 -0800 + Commit: 288ce58, github.com/apache/spark/pull/3472 + + [SPARK-4612] Reduce task latency and increase scheduling throughput by making configuration initialization lazy + Tathagata Das + 2014-11-25 23:15:58 -0800 + Commit: e7f4d25, github.com/apache/spark/pull/3463 + + [SPARK-4516] Avoid allocating Netty PooledByteBufAllocators unnecessarily + Aaron Davidson + 2014-11-26 00:32:45 -0500 + Commit: 346bc17, github.com/apache/spark/pull/3465 + + [SPARK-4516] Cap default number of Netty threads at 8 + Aaron Davidson + 2014-11-25 23:57:04 -0500 + Commit: f5f2d27, github.com/apache/spark/pull/3469 + + [SPARK-4604][MLLIB] make MatrixFactorizationModel public + Xiangrui Meng + 2014-11-25 20:11:40 -0800 + Commit: b5fb141, github.com/apache/spark/pull/3459 + + [HOTFIX]: Adding back without-hive dist + Patrick Wendell + 2014-11-25 23:10:19 -0500 + Commit: 4d95526 + + [SPARK-4583] [mllib] LogLoss for GradientBoostedTrees fix + doc updates + Joseph K. Bradley + 2014-11-25 20:10:15 -0800 + Commit: c251fd7, github.com/apache/spark/pull/3439 + + [Spark-4509] Revert EC2 tag-based cluster membership patch + Xiangrui Meng + 2014-11-25 16:07:09 -0800 + Commit: 7eba0fb, github.com/apache/spark/pull/3453 + + Fix SPARK-4471: blockManagerIdFromJson function throws exception while B... 
+ hushan[胡珊] + 2014-11-25 15:51:08 -0800 + Commit: 9bdf5da, github.com/apache/spark/pull/3340 + + [SPARK-4546] Improve HistoryServer first time user experience + Andrew Or + 2014-11-25 15:48:02 -0800 + Commit: 9afcbe4, github.com/apache/spark/pull/3411 + + [SPARK-4592] Avoid duplicate worker registrations in standalone mode + Andrew Or + 2014-11-25 15:46:26 -0800 + Commit: 1b2ab1c, github.com/apache/spark/pull/3447 + + [SPARK-4196][SPARK-4602][Streaming] Fix serialization issue in PairDStreamFunctions.saveAsNewAPIHadoopFiles + Tathagata Das + 2014-11-25 14:16:27 -0800 + Commit: 8838ad7, github.com/apache/spark/pull/3457 + + [SPARK-4581][MLlib] Refactorize StandardScaler to improve the transformation performance + DB Tsai + 2014-11-25 11:07:11 -0800 + Commit: bf1a6aa, github.com/apache/spark/pull/3435 + + [SPARK-4601][Streaming] Set correct call site for streaming jobs so that it is displayed correctly on the Spark UI + Tathagata Das + 2014-11-25 06:50:36 -0800 + Commit: 69cd53e, github.com/apache/spark/pull/3455 + + [SPARK-4344][DOCS] adding documentation on spark.yarn.user.classpath.first + arahuja + 2014-11-25 08:23:41 -0600 + Commit: d240760, github.com/apache/spark/pull/3209 + + [SPARK-4381][Streaming]Add warning log when user set spark.master to local in Spark Streaming and there's no job executed + jerryshao + 2014-11-25 05:36:29 -0800 + Commit: fef27b2, github.com/apache/spark/pull/3244 + + [SPARK-4535][Streaming] Fix the error in comments + q00251598 + 2014-11-25 04:01:56 -0800 + Commit: a51118a, github.com/apache/spark/pull/3400 + + [SPARK-4526][MLLIB]GradientDescent get a wrong gradient value according to the gradient formula. + GuoQiang Li + 2014-11-25 02:01:19 -0800 + Commit: f515f94, github.com/apache/spark/pull/3399 + + [SPARK-4596][MLLib] Refactorize Normalizer to make code cleaner + DB Tsai + 2014-11-25 01:57:34 -0800 + Commit: 89f9122, github.com/apache/spark/pull/3446 + + [DOC][Build] Wrong cmd for build spark with apache hadoop 2.4.X and hive 12 + wangfei + 2014-11-24 22:32:39 -0800 + Commit: 0fe54cf, github.com/apache/spark/pull/3335 + + [SQL] Compute timeTaken correctly + w00228970 + 2014-11-24 21:17:24 -0800 + Commit: 723be60, github.com/apache/spark/pull/3423 + + [SPARK-4582][MLLIB] get raw vectors for further processing in Word2Vec + tkaessmann , tkaessmann + 2014-11-24 19:58:01 -0800 + Commit: 9ce2bf3, github.com/apache/spark/pull/3309 + + [SPARK-4525] Mesos should decline unused offers + Patrick Wendell , Jongyoul Lee + 2014-11-24 19:14:14 -0800 + Commit: f0afb62, github.com/apache/spark/pull/3436 + + Revert "[SPARK-4525] Mesos should decline unused offers" + Patrick Wendell + 2014-11-24 19:16:53 -0800 + Commit: a68d442 + + [SPARK-4525] Mesos should decline unused offers + Patrick Wendell , Jongyoul Lee + 2014-11-24 19:14:14 -0800 + Commit: b043c27, github.com/apache/spark/pull/3436 + + [SPARK-4266] [Web-UI] Reduce stage page load time. 
+ Kay Ousterhout + 2014-11-24 18:03:10 -0800 + Commit: d24d5bf, github.com/apache/spark/pull/3328 + + [SPARK-4548] []SPARK-4517] improve performance of python broadcast + Davies Liu + 2014-11-24 17:17:03 -0800 + Commit: 6cf5076, github.com/apache/spark/pull/3417 + + [SPARK-4578] fix asDict() with nested Row() + Davies Liu + 2014-11-24 16:41:23 -0800 + Commit: 050616b, github.com/apache/spark/pull/3434 + + [SPARK-4562] [MLlib] speedup vector + Davies Liu + 2014-11-24 16:37:14 -0800 + Commit: b660de7, github.com/apache/spark/pull/3420 + + [SPARK-4518][SPARK-4519][Streaming] Refactored file stream to prevent files from being processed multiple times + Tathagata Das + 2014-11-24 13:50:20 -0800 + Commit: cb0e9b0, github.com/apache/spark/pull/3419 + + [SPARK-4145] Web UI job pages + Josh Rosen + 2014-11-24 13:18:14 -0800 + Commit: 4a90276, github.com/apache/spark/pull/3009 + + [SPARK-4487][SQL] Fix attribute reference resolution error when using ORDER BY. + Kousuke Saruta + 2014-11-24 12:54:37 -0800 + Commit: dd1c9cb, github.com/apache/spark/pull/3363 + + [SQL] Fix path in HiveFromSpark + scwf + 2014-11-24 12:49:08 -0800 + Commit: b384119, github.com/apache/spark/pull/3415 + + [SQL] Fix comment in HiveShim + Daniel Darabos + 2014-11-24 12:45:07 -0800 + Commit: d5834f0, github.com/apache/spark/pull/3432 + + [SPARK-4479][SQL] Avoids unnecessary defensive copies when sort based shuffle is on + Cheng Lian + 2014-11-24 12:43:45 -0800 + Commit: a6d7b61, github.com/apache/spark/pull/3422 + + SPARK-4457. Document how to build for Hadoop versions greater than 2.4 + Sandy Ryza + 2014-11-24 13:28:48 -0600 + Commit: 29372b6, github.com/apache/spark/pull/3322 + + [SPARK-4377] Fixed serialization issue by switching to akka provided serializer. + Prashant Sharma + 2014-11-22 14:05:38 -0800 + Commit: 9b2a3c6, github.com/apache/spark/pull/3402 + + [SPARK-4431][MLlib] Implement efficient foreachActive for dense and sparse vector + DB Tsai + 2014-11-21 18:15:07 -0800 + Commit: b5d17ef, github.com/apache/spark/pull/3288 + + [SPARK-4531] [MLlib] cache serialized java object + Davies Liu + 2014-11-21 15:02:31 -0800 + Commit: ce95bd8, github.com/apache/spark/pull/3397 + + SPARK-4532: Fix bug in detection of Hive in Spark 1.2 + Patrick Wendell + 2014-11-21 12:10:04 -0800 + Commit: a81918c, github.com/apache/spark/pull/3398 + + [SPARK-4397][Core] Reorganize 'implicit's to improve the API convenience + zsxwing + 2014-11-21 10:06:30 -0800 + Commit: 65b987c, github.com/apache/spark/pull/3262 + + [SPARK-4472][Shell] Print "Spark context available as sc." only when SparkContext is created... + zsxwing + 2014-11-21 00:42:43 -0800 + Commit: f1069b8, github.com/apache/spark/pull/3341 + + [Doc][GraphX] Remove unused png files. + Reynold Xin + 2014-11-21 00:30:58 -0800 + Commit: 28fdc6f + + [Doc][GraphX] Remove Motivation section and did some minor update. + Reynold Xin + 2014-11-21 00:29:02 -0800 + Commit: b97070e + + [SPARK-4522][SQL] Parse schema with missing metadata. 
+ Michael Armbrust + 2014-11-20 20:34:43 -0800 + Commit: 90a6a46, github.com/apache/spark/pull/3392 + + add Sphinx as a dependency of building docs + Davies Liu + 2014-11-20 19:12:45 -0800 + Commit: 8cd6eea, github.com/apache/spark/pull/3388 + + [SPARK-4413][SQL] Parquet support through datasource API + Michael Armbrust + 2014-11-20 18:31:02 -0800 + Commit: 02ec058, github.com/apache/spark/pull/3269 + + [SPARK-4244] [SQL] Support Hive Generic UDFs with constant object inspector parameters + Cheng Hao + 2014-11-20 16:50:59 -0800 + Commit: 84d79ee, github.com/apache/spark/pull/3109 + + [SPARK-4477] [PySpark] remove numpy from RDDSampler + Davies Liu , Xiangrui Meng + 2014-11-20 16:40:25 -0800 + Commit: d39f2e9, github.com/apache/spark/pull/3351 + + [SQL] fix function description mistake + Jacky Li + 2014-11-20 15:48:36 -0800 + Commit: ad5f1f3, github.com/apache/spark/pull/3344 + + [SPARK-2918] [SQL] Support the CTAS in EXPLAIN command + Cheng Hao + 2014-11-20 15:46:00 -0800 + Commit: 6aa0fc9, github.com/apache/spark/pull/3357 + + [SPARK-4318][SQL] Fix empty sum distinct. + Takuya UESHIN + 2014-11-20 15:41:24 -0800 + Commit: 2c2e7a4, github.com/apache/spark/pull/3184 + + [SPARK-4513][SQL] Support relational operator '<=>' in Spark SQL + ravipesala + 2014-11-20 15:34:03 -0800 + Commit: 98e9419, github.com/apache/spark/pull/3387 + + [SPARK-4439] [MLlib] add python api for random forest + Davies Liu + 2014-11-20 15:31:28 -0800 + Commit: 1c53a5d, github.com/apache/spark/pull/3320 + + [SPARK-4228][SQL] SchemaRDD to JSON + Dan McClary + 2014-11-20 13:36:50 -0800 + Commit: b8e6886, github.com/apache/spark/pull/3213 + + [SPARK-3938][SQL] Names in-memory columnar RDD with corresponding table name + Cheng Lian + 2014-11-20 13:12:24 -0800 + Commit: abf2918, github.com/apache/spark/pull/3383 + + [SPARK-4486][MLLIB] Improve GradientBoosting APIs and doc + Xiangrui Meng + 2014-11-20 00:48:59 -0800 + Commit: 15cacc8, github.com/apache/spark/pull/3374 + + [SPARK-4446] [SPARK CORE] + Leolh + 2014-11-19 18:18:55 -0800 + Commit: e216ffa, github.com/apache/spark/pull/3306 + + [SPARK-4480] Avoid many small spills in external data structures + Andrew Or + 2014-11-19 18:07:27 -0800 + Commit: 0eb4a7f, github.com/apache/spark/pull/3353 + + [Spark-4484] Treat maxResultSize as unlimited when set to 0; improve error message + Nishkam Ravi , nravi , nishkamravi2 + 2014-11-19 17:23:42 -0800 + Commit: 73fedf5, github.com/apache/spark/pull/3360 + + [SPARK-4478] Keep totalRegisteredExecutors up-to-date + Akshat Aranya + 2014-11-19 17:20:20 -0800 + Commit: 9ccc53c, github.com/apache/spark/pull/3373 + + Updating GraphX programming guide and documentation + Joseph E. 
Gonzalez + 2014-11-19 16:53:33 -0800 + Commit: 377b068, github.com/apache/spark/pull/3359 + + [SPARK-4495] Fix memory leak in JobProgressListener + Josh Rosen + 2014-11-19 16:50:21 -0800 + Commit: 04d462f, github.com/apache/spark/pull/3372 + + [SPARK-4294][Streaming] UnionDStream stream should express the requirements in the same way as TransformedDStream + Yadong Qi + 2014-11-19 15:53:06 -0800 + Commit: c3002c4, github.com/apache/spark/pull/3152 + + [SPARK-4384] [PySpark] improve sort spilling + Davies Liu + 2014-11-19 15:45:37 -0800 + Commit: 73c8ea8, github.com/apache/spark/pull/3252 + + [SPARK-4429][BUILD] Build for Scala 2.11 using sbt fails. + Takuya UESHIN + 2014-11-19 14:40:21 -0800 + Commit: f9adda9, github.com/apache/spark/pull/3342 + + [DOC][PySpark][Streaming] Fix docstring for sphinx + Ken Takagiwa + 2014-11-19 14:23:18 -0800 + Commit: 9b7bbce, github.com/apache/spark/pull/3311 + + SPARK-3962 Marked scope as provided for external projects. + Prashant Sharma , Prashant Sharma + 2014-11-19 14:18:10 -0800 + Commit: 1c93841, github.com/apache/spark/pull/2959 + + [HOT FIX] MiMa tests are broken + Andrew Or + 2014-11-19 14:03:44 -0800 + Commit: 0df02ca, github.com/apache/spark/pull/3371 + + [SPARK-4481][Streaming][Doc] Fix the wrong description of updateFunc + zsxwing + 2014-11-19 13:17:15 -0800 + Commit: 3bf7cee, github.com/apache/spark/pull/3356 + + [SPARK-4482][Streaming] Disable ReceivedBlockTracker's write ahead log by default + Tathagata Das + 2014-11-19 13:06:48 -0800 + Commit: 22fc4e7, github.com/apache/spark/pull/3358 + + [SPARK-4470] Validate number of threads in local mode + Kenichi Maehashi + 2014-11-19 12:11:09 -0800 + Commit: eacc788, github.com/apache/spark/pull/3337 + + [SPARK-4467] fix elements read count for ExtrenalSorter + Tianshuo Deng + 2014-11-19 10:01:09 -0800 + Commit: d75579d, github.com/apache/spark/pull/3302 + + SPARK-4455 Exclude dependency on hbase-annotations module + tedyu + 2014-11-19 00:55:39 -0800 + Commit: 5f5ac2d, github.com/apache/spark/pull/3286 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-11-19 00:27:31 -0800 + Commit: 8327df6, github.com/apache/spark/pull/2777 + + [Spark-4432]close InStream after the block is accessed + Mingfei + 2014-11-18 22:17:06 -0800 + Commit: 165cec9, github.com/apache/spark/pull/3290 + + [SPARK-4441] Close Tachyon client when TachyonBlockManager is shutdown + Mingfei + 2014-11-18 22:16:36 -0800 + Commit: 67e9876, github.com/apache/spark/pull/3299 + + Bumping version to 1.3.0-SNAPSHOT. 
+ Marcelo Vanzin + 2014-11-18 21:24:18 -0800 + Commit: 397d3aa, github.com/apache/spark/pull/3277 + + [SPARK-4468][SQL] Fixes Parquet filter creation for inequality predicates with literals on the left hand side + Cheng Lian + 2014-11-18 17:41:54 -0800 + Commit: 423baea, github.com/apache/spark/pull/3334 + + [SPARK-4327] [PySpark] Python API for RDD.randomSplit() + Davies Liu + 2014-11-18 16:37:35 -0800 + Commit: 7f22fa8, github.com/apache/spark/pull/3193 + + [SPARK-4433] fix a racing condition in zipWithIndex + Xiangrui Meng + 2014-11-18 16:25:44 -0800 + Commit: bb46046, github.com/apache/spark/pull/3291 + + [SPARK-3721] [PySpark] broadcast objects larger than 2G + Davies Liu , Davies Liu + 2014-11-18 16:17:51 -0800 + Commit: 4a377af, github.com/apache/spark/pull/2659 + + [SPARK-4306] [MLlib] Python API for LogisticRegressionWithLBFGS + Davies Liu + 2014-11-18 15:57:33 -0800 + Commit: d2e2951, github.com/apache/spark/pull/3307 + + [SPARK-4463] Add (de)select all button for add'l metrics. + Kay Ousterhout + 2014-11-18 15:01:06 -0800 + Commit: 010bc86, github.com/apache/spark/pull/3331 + + [SPARK-4017] show progress bar in console + Davies Liu + 2014-11-18 13:37:21 -0800 + Commit: e34f38f, github.com/apache/spark/pull/3029 + + [SPARK-4404] remove sys.exit() in shutdown hook + Davies Liu + 2014-11-18 13:11:38 -0800 + Commit: 80f3177, github.com/apache/spark/pull/3289 + + [SPARK-4075][SPARK-4434] Fix the URI validation logic for Application Jar name. + Kousuke Saruta + 2014-11-18 12:17:33 -0800 + Commit: bfebfd8, github.com/apache/spark/pull/3326 + + [SQL] Support partitioned parquet tables that have the key in both the directory and the file + Michael Armbrust + 2014-11-18 12:13:23 -0800 + Commit: 90d72ec, github.com/apache/spark/pull/3272 + + [SPARK-4396] allow lookup by index in Python's Rating + Xiangrui Meng + 2014-11-18 10:35:29 -0800 + Commit: b54c6ab, github.com/apache/spark/pull/3261 + + [SPARK-4435] [MLlib] [PySpark] improve classification + Davies Liu + 2014-11-18 10:11:13 -0800 + Commit: 8fbf72b, github.com/apache/spark/pull/3305 + + ALS implicit: added missing parameter alpha in doc string + Felix Maximilian Möller + 2014-11-18 10:08:24 -0800 + Commit: cedc3b5, github.com/apache/spark/pull/3343 + + SPARK-4466: Provide support for publishing Scala 2.11 artifacts to Maven + Patrick Wendell + 2014-11-17 21:07:50 -0800 + Commit: c6e0c2a, github.com/apache/spark/pull/3332 + + [SPARK-4453][SPARK-4213][SQL] Simplifies Parquet filter generation code + Cheng Lian + 2014-11-17 16:55:12 -0800 + Commit: 36b0956, github.com/apache/spark/pull/3317 + + [SPARK-4448] [SQL] unwrap for the ConstantObjectInspector + Cheng Hao + 2014-11-17 16:35:49 -0800 + Commit: ef7c464, github.com/apache/spark/pull/3308 + + [SPARK-4443][SQL] Fix statistics for external table in spark sql hive + w00228970 + 2014-11-17 16:33:50 -0800 + Commit: 42389b1, github.com/apache/spark/pull/3304 + + [SPARK-4309][SPARK-4407][SQL] Date type support for Thrift server, and fixes for complex types + Cheng Lian + 2014-11-17 16:31:05 -0800 + Commit: 6b7f2f7, github.com/apache/spark/pull/3298 + + [SQL] Construct the MutableRow from an Array + Cheng Hao + 2014-11-17 16:29:52 -0800 + Commit: 69e858c, github.com/apache/spark/pull/3217 + + [SPARK-4425][SQL] 
Handle NaN or Infinity cast to Timestamp correctly. + Takuya UESHIN + 2014-11-17 16:28:07 -0800 + Commit: 566c791, github.com/apache/spark/pull/3283 + + [SPARK-4420][SQL] Change nullability of Cast from DoubleType/FloatType to DecimalType. + Takuya UESHIN + 2014-11-17 16:26:48 -0800 + Commit: 3a81a1c, github.com/apache/spark/pull/3278 + + [SQL] Makes conjunction pushdown more aggressive for in-memory table + Cheng Lian + 2014-11-17 15:33:13 -0800 + Commit: 5ce7dae, github.com/apache/spark/pull/3318 + + [SPARK-4180] [Core] Prevent creation of multiple active SparkContexts + Josh Rosen + 2014-11-17 12:48:18 -0800 + Commit: 0f3ceb5, github.com/apache/spark/pull/3121 + + [DOCS][SQL] Fix broken link to Row class scaladoc + Andy Konwinski + 2014-11-17 11:52:23 -0800 + Commit: cec1116, github.com/apache/spark/pull/3323 + + Revert "[SPARK-4075] [Deploy] Jar url validation is not enough for Jar file" + Andrew Or + 2014-11-17 11:24:28 -0800 + Commit: dbb9da5 + + [SPARK-4444] Drop VD type parameter from EdgeRDD + Ankur Dave + 2014-11-17 11:06:31 -0800 + Commit: 9ac2bb1, github.com/apache/spark/pull/3303 + + SPARK-2811 upgrade algebird to 0.8.1 + Adam Pingel + 2014-11-17 10:47:29 -0800 + Commit: e7690ed, github.com/apache/spark/pull/3282 + + SPARK-4445, Don't display storage level in toDebugString unless RDD is persisted. + Prashant Sharma + 2014-11-17 10:40:33 -0800 + Commit: 5c92d47, github.com/apache/spark/pull/3310 + + [SPARK-4410][SQL] Add support for external sort + Michael Armbrust + 2014-11-16 21:55:57 -0800 + Commit: 64c6b9b, github.com/apache/spark/pull/3268 + + [SPARK-4422][MLLIB]In some cases, Vectors.fromBreeze get wrong results. + GuoQiang Li + 2014-11-16 21:31:51 -0800 + Commit: 5168c6c, github.com/apache/spark/pull/3281 + + Revert "[SPARK-4309][SPARK-4407][SQL] Date type support for Thrift server, and fixes for complex types" + Michael Armbrust + 2014-11-16 15:05:04 -0800 + Commit: 45ce327, github.com/apache/spark/pull/3292 + + [SPARK-4309][SPARK-4407][SQL] Date type support for Thrift server, and fixes for complex types + Cheng Lian + 2014-11-16 14:26:41 -0800 + Commit: cb6bd83, github.com/apache/spark/pull/3178 + + [SPARK-4393] Fix memory leak in ConnectionManager ACK timeout TimerTasks; use HashedWheelTimer + Josh Rosen + 2014-11-16 00:44:15 -0800 + Commit: 7850e0c, github.com/apache/spark/pull/3259 + + [SPARK-4426][SQL][Minor] The symbol of BitwiseOr is wrong, should not be '&' + Kousuke Saruta + 2014-11-15 22:23:47 -0800 + Commit: 84468b2, github.com/apache/spark/pull/3284 + + [SPARK-4419] Upgrade snappy-java to 1.1.1.6 + Josh Rosen + 2014-11-15 22:22:34 -0800 + Commit: 7d8e152, github.com/apache/spark/pull/3287 + + [SPARK-2321] Several progress API improvements / refactorings + Josh Rosen + 2014-11-14 23:46:25 -0800 + Commit: 40eb8b6, github.com/apache/spark/pull/3197 + + Added contains(key) to Metadata + kai + 2014-11-14 23:44:23 -0800 + Commit: cbddac2, github.com/apache/spark/pull/3273 + + [SPARK-4260] Httpbroadcast should set connection timeout. 
+ Kousuke Saruta + 2014-11-14 22:36:56 -0800 + Commit: 60969b0, github.com/apache/spark/pull/3122 + + [SPARK-4363][Doc] Update the Broadcast example + zsxwing + 2014-11-14 22:28:48 -0800 + Commit: 861223e, github.com/apache/spark/pull/3226 + + [SPARK-4379][Core] Change Exception to SparkException in checkpoint + zsxwing + 2014-11-14 22:25:41 -0800 + Commit: dba1405, github.com/apache/spark/pull/3241 + + [SPARK-4415] [PySpark] JVM should exit after Python exit + Davies Liu + 2014-11-14 20:13:46 -0800 + Commit: 7fe08b4, github.com/apache/spark/pull/3274 + + [SPARK-4404]SparkSubmitDriverBootstrapper should stop after its SparkSubmit sub-proc... + WangTao , WangTaoTheTonic + 2014-11-14 20:11:51 -0800 + Commit: 303a4e4, github.com/apache/spark/pull/3266 + + SPARK-4214. With dynamic allocation, avoid outstanding requests for more... + Sandy Ryza + 2014-11-14 15:51:05 -0800 + Commit: ad42b28, github.com/apache/spark/pull/3204 + + [SPARK-4412][SQL] Fix Spark's control of Parquet logging. + Jim Carroll + 2014-11-14 15:33:21 -0800 + Commit: 37482ce, github.com/apache/spark/pull/3271 + + [SPARK-4365][SQL] Remove unnecessary filter call on records returned from parquet library + Yash Datta + 2014-11-14 15:16:36 -0800 + Commit: 63ca3af, github.com/apache/spark/pull/3229 + + [SPARK-4386] Improve performance when writing Parquet files. + Jim Carroll + 2014-11-14 15:11:53 -0800 + Commit: f76b968, github.com/apache/spark/pull/3254 + + [SPARK-4322][SQL] Enables struct fields as sub expressions of grouping fields + Cheng Lian + 2014-11-14 15:09:36 -0800 + Commit: 0c7b66b, github.com/apache/spark/pull/3248 + + [SQL] Don't shuffle code generated rows + Michael Armbrust + 2014-11-14 15:03:23 -0800 + Commit: 4b4b50c, github.com/apache/spark/pull/3263 + + [SQL] Minor cleanup of comments, errors and override. + Michael Armbrust + 2014-11-14 15:00:42 -0800 + Commit: f805025, github.com/apache/spark/pull/3257 + + [SPARK-4391][SQL] Configure parquet filters using SQLConf + Michael Armbrust + 2014-11-14 14:59:35 -0800 + Commit: e47c387, github.com/apache/spark/pull/3258 + + [SPARK-4390][SQL] Handle NaN cast to decimal correctly + Michael Armbrust + 2014-11-14 14:56:57 -0800 + Commit: a0300ea, github.com/apache/spark/pull/3256 + + [SPARK-4062][Streaming]Add ReliableKafkaReceiver in Spark Streaming Kafka connector + jerryshao , Tathagata Das , Saisai Shao + 2014-11-14 14:33:37 -0800 + Commit: 5930f64, github.com/apache/spark/pull/2991 + + [SPARK-4333][SQL] Correctly log number of iterations in RuleExecutor + DoingDone9 <799203320@qq.com> + 2014-11-14 14:28:06 -0800 + Commit: 0cbdb01, github.com/apache/spark/pull/3180 + + SPARK-4375. no longer require -Pscala-2.10 + Sandy Ryza + 2014-11-14 14:21:57 -0800 + Commit: f5f757e, github.com/apache/spark/pull/3239 + + [SPARK-4245][SQL] Fix containsNull of the result ArrayType of CreateArray expression. 
+ Takuya UESHIN + 2014-11-14 14:21:16 -0800 + Commit: bbd8f5b, github.com/apache/spark/pull/3110 + + [SPARK-4239] [SQL] support view in HiveQl + Daoyuan Wang + 2014-11-14 13:51:20 -0800 + Commit: ade72c4, github.com/apache/spark/pull/3131 + + Update failed assert text to match code in SizeEstimatorSuite + Jeff Hammerbacher + 2014-11-14 13:37:48 -0800 + Commit: c258db9, github.com/apache/spark/pull/3242 + + [SPARK-4313][WebUI][Yarn] Fix link issue of the executor thread dump page in yarn-cluster mode + zsxwing + 2014-11-14 13:36:13 -0800 + Commit: 156cf33, github.com/apache/spark/pull/3183 + + SPARK-3663 Document SPARK_LOG_DIR and SPARK_PID_DIR + Andrew Ash + 2014-11-14 13:33:35 -0800 + Commit: 5c265cc, github.com/apache/spark/pull/2518 + + [Spark Core] SPARK-4380 Edit spilling log from MB to B + Hong Shen + 2014-11-14 13:29:41 -0800 + Commit: 0c56a03, github.com/apache/spark/pull/3243 + + [SPARK-4398][PySpark] specialize sc.parallelize(xrange) + Xiangrui Meng + 2014-11-14 12:43:17 -0800 + Commit: abd5817, github.com/apache/spark/pull/3264 + + [SPARK-4394][SQL] Data Sources API Improvements + Michael Armbrust + 2014-11-14 12:00:08 -0800 + Commit: 77e845c, github.com/apache/spark/pull/3260 + + [SPARK-3722][Docs]minor improvement and fix in docs + WangTao + 2014-11-14 08:09:42 -0600 + Commit: e421072, github.com/apache/spark/pull/2579 + + [SPARK-4310][WebUI] Sort 'Submitted' column in Stage page by time + zsxwing + 2014-11-13 14:37:04 -0800 + Commit: 825709a, github.com/apache/spark/pull/3179 + + [SPARK-4372][MLLIB] Make LR and SVM's default parameters consistent in Scala and Python + Xiangrui Meng + 2014-11-13 13:54:16 -0800 + Commit: 3221830, github.com/apache/spark/pull/3232 + + [SPARK-4326] fix unidoc + Xiangrui Meng + 2014-11-13 13:16:20 -0800 + Commit: 4b0c1ed, github.com/apache/spark/pull/3253 + + [HOT FIX] make-distribution.sh fails if Yarn shuffle jar DNE + Andrew Or + 2014-11-13 11:54:45 -0800 + Commit: a0fa1ba, github.com/apache/spark/pull/3250 + + [SPARK-4378][MLLIB] make ALS more Java-friendly + Xiangrui Meng + 2014-11-13 11:42:27 -0800 + Commit: ca26a21, github.com/apache/spark/pull/3240 + + [SPARK-4348] [PySpark] [MLlib] rename random.py to rand.py + Davies Liu + 2014-11-13 10:24:54 -0800 + Commit: ce0333f, github.com/apache/spark/pull/3216 + + [SPARK-4256] Make Binary Evaluation Metrics functions defined in cases where there ar... 
+ Andrew Bullen + 2014-11-12 22:14:44 -0800 + Commit: 484fecb, github.com/apache/spark/pull/3118 + + [SPARK-4370] [Core] Limit number of Netty cores based on executor size + Aaron Davidson + 2014-11-12 18:46:37 -0800 + Commit: b9e1c2e, github.com/apache/spark/pull/3155 + + [SPARK-4373][MLLIB] fix MLlib maven tests + Xiangrui Meng + 2014-11-12 18:15:14 -0800 + Commit: 23f5bdf, github.com/apache/spark/pull/3235 + + [Release] Bring audit scripts up-to-date + Andrew Or + 2014-11-13 00:30:58 +0000 + Commit: 723a86b + + [SPARK-2672] support compressed file in wholeTextFile + Davies Liu + 2014-11-12 15:58:12 -0800 + Commit: d7d54a4, github.com/apache/spark/pull/3005 + + [SPARK-4369] [MLLib] fix TreeModel.predict() with RDD + Davies Liu + 2014-11-12 13:56:41 -0800 + Commit: bd86118, github.com/apache/spark/pull/3230 + + [SPARK-3666] Extract interfaces for EdgeRDD and VertexRDD + Ankur Dave + 2014-11-12 13:49:20 -0800 + Commit: a5ef581, github.com/apache/spark/pull/2530 + + [Release] Correct make-distribution.sh log path + Andrew Or + 2014-11-12 13:46:26 -0800 + Commit: c3afd32 + + Internal cleanup for aggregateMessages + Ankur Dave + 2014-11-12 13:44:49 -0800 + Commit: 0402be9, github.com/apache/spark/pull/3231 + + [SPARK-4281][Build] Package Yarn shuffle service into its own jar + Andrew Or + 2014-11-12 13:39:45 -0800 + Commit: aa43a8d, github.com/apache/spark/pull/3147 + + [Test] Better exception message from SparkSubmitSuite + Andrew Or + 2014-11-12 13:35:48 -0800 + Commit: 6e3c5a2, github.com/apache/spark/pull/3212 + + [SPARK-3660][STREAMING] Initial RDD for updateStateByKey transformation + Soumitra Kumar + 2014-11-12 12:25:31 -0800 + Commit: 36ddeb7, github.com/apache/spark/pull/2665 + + [SPARK-3530][MLLIB] pipeline and parameters with examples + Xiangrui Meng + 2014-11-12 10:38:57 -0800 + Commit: 4b736db, github.com/apache/spark/pull/3099 + + [SPARK-4355][MLLIB] fix OnlineSummarizer.merge when other.mean is zero + Xiangrui Meng + 2014-11-12 01:50:11 -0800 + Commit: 84324fb, github.com/apache/spark/pull/3220 + + [SPARK-3936] Add aggregateMessages, which supersedes mapReduceTriplets + Ankur Dave + 2014-11-11 23:38:27 -0800 + Commit: faeb41d, github.com/apache/spark/pull/3100 + + [MLLIB] SPARK-4347: Reducing GradientBoostingSuite run time. 
+ Manish Amde + 2014-11-11 22:47:53 -0800 + Commit: 2ef016b, github.com/apache/spark/pull/3214 + + Support cross building for Scala 2.11 + Prashant Sharma , Patrick Wendell + 2014-11-11 21:36:48 -0800 + Commit: daaca14, github.com/apache/spark/pull/3159 + + [Release] Log build output for each distribution + Andrew Or + 2014-11-11 18:02:59 -0800 + Commit: 2ddb141 + + SPARK-2269 Refactor mesos scheduler resourceOffers and add unit test + Timothy Chen + 2014-11-11 14:29:18 -0800 + Commit: a878660, github.com/apache/spark/pull/1487 + + [SPARK-4282][YARN] Stopping flag in YarnClientSchedulerBackend should be volatile + Kousuke Saruta + 2014-11-11 12:33:53 -0600 + Commit: 7f37188, github.com/apache/spark/pull/3143 + + SPARK-4305 [BUILD] yarn-alpha profile won't build due to network/yarn module + Sean Owen + 2014-11-11 12:30:35 -0600 + Commit: f820b56, github.com/apache/spark/pull/3167 + + SPARK-1830 Deploy failover, Make Persistence engine and LeaderAgent Pluggable + Prashant Sharma + 2014-11-11 09:29:48 -0800 + Commit: deefd9d, github.com/apache/spark/pull/771 + + [Streaming][Minor]Replace some 'if-else' in Clock + huangzhaowei + 2014-11-11 03:02:12 -0800 + Commit: 6e03de3, github.com/apache/spark/pull/3088 + + [SPARK-2492][Streaming] kafkaReceiver minor changes to align with Kafka 0.8 + jerryshao + 2014-11-11 02:22:23 -0800 + Commit: c8850a3, github.com/apache/spark/pull/1420 + + [SPARK-4295][External]Fix exception in SparkSinkSuite + maji2014 + 2014-11-11 02:18:27 -0800 + Commit: f8811a5, github.com/apache/spark/pull/3177 + + [SPARK-4307] Initialize FileDescriptor lazily in FileRegion. + Reynold Xin , Reynold Xin + 2014-11-11 00:25:31 -0800 + Commit: ef29a9a, github.com/apache/spark/pull/3172 + + [SPARK-4324] [PySpark] [MLlib] support numpy.array for all MLlib API + Davies Liu + 2014-11-10 22:26:16 -0800 + Commit: 65083e9, github.com/apache/spark/pull/3189 + + [SPARK-4330][Doc] Link to proper URL for YARN overview + Kousuke Saruta + 2014-11-10 22:18:00 -0800 + Commit: 3c07b8f, github.com/apache/spark/pull/3196 + + [SPARK-3649] Remove GraphX custom serializers + Ankur Dave + 2014-11-10 19:31:52 -0800 + Commit: 300887b, github.com/apache/spark/pull/2503 + + [SPARK-4274] [SQL] Fix NPE in printing the details of the query plan + Cheng Hao + 2014-11-10 17:46:05 -0800 + Commit: c764d0a, github.com/apache/spark/pull/3139 + + [SPARK-3954][Streaming] Optimization to FileInputDStream + surq + 2014-11-10 17:37:16 -0800 + Commit: ce6ed2a, github.com/apache/spark/pull/2811 + + [SPARK-4149][SQL] ISO 8601 support for json date time strings + Daoyuan Wang + 2014-11-10 17:26:03 -0800 + Commit: a1fc059, github.com/apache/spark/pull/3012 + + [SPARK-4250] [SQL] Fix bug of constant null value mapping to ConstantObjectInspector + Cheng Hao + 2014-11-10 17:22:57 -0800 + Commit: fa77783, github.com/apache/spark/pull/3114 + + [SQL] remove a decimal case branch that has no effect at runtime + Xiangrui Meng + 2014-11-10 17:20:52 -0800 + Commit: d793d80, github.com/apache/spark/pull/3192 + + [SPARK-4308][SQL] Sets SQL operation state to ERROR when exception is thrown + Cheng Lian + 2014-11-10 16:56:36 -0800 + Commit: acb55ae, github.com/apache/spark/pull/3175 + + [SPARK-4000][Build] Uploads HiveCompatibilitySuite logs + Cheng Lian + 2014-11-10 16:17:52 -0800 
+ Commit: 534b231, github.com/apache/spark/pull/2993 + + [SPARK-4319][SQL] Enable an ignored test "null count". + Takuya UESHIN + 2014-11-10 15:55:15 -0800 + Commit: dbf1058, github.com/apache/spark/pull/3185 + + Revert "[SPARK-2703][Core]Make Tachyon related unit tests execute without deploying a Tachyon system locally." + Patrick Wendell + 2014-11-10 14:56:06 -0800 + Commit: 6e7a309 + + [SPARK-4047] - Generate runtime warnings for example implementation of PageRank + Varadharajan Mukundan + 2014-11-10 14:32:29 -0800 + Commit: 974d334, github.com/apache/spark/pull/2894 + + SPARK-1297 Upgrade HBase dependency to 0.98 + tedyu + 2014-11-10 13:23:33 -0800 + Commit: b32734e, github.com/apache/spark/pull/3115 + + SPARK-4230. Doc for spark.default.parallelism is incorrect + Sandy Ryza + 2014-11-10 12:40:41 -0800 + Commit: c6f4e70, github.com/apache/spark/pull/3107 + + [SPARK-4312] bash doesn't have "die" + Jey Kottalam + 2014-11-10 12:37:56 -0800 + Commit: c5db8e2, github.com/apache/spark/pull/2898 + + Update RecoverableNetworkWordCount.scala + comcmipi + 2014-11-10 12:33:48 -0800 + Commit: 0340c56, github.com/apache/spark/pull/2735 + + SPARK-2548 [STREAMING] JavaRecoverableWordCount is missing + Sean Owen + 2014-11-10 11:47:27 -0800 + Commit: 3a02d41, github.com/apache/spark/pull/2564 + + [SPARK-4169] [Core] Accommodate non-English Locales in unit tests + Niklas Wilcke <1wilcke@informatik.uni-hamburg.de> + 2014-11-10 11:37:38 -0800 + Commit: ed8bf1e, github.com/apache/spark/pull/3036 + + [SQL] support udt to hive types conversion (hive->udt is not supported) + Xiangrui Meng + 2014-11-10 11:04:12 -0800 + Commit: 894a724, github.com/apache/spark/pull/3164 + + [SPARK-2703][Core]Make Tachyon related unit tests execute without deploying a Tachyon system locally. + RongGu + 2014-11-09 23:48:15 -0800 + Commit: bd86cb1, github.com/apache/spark/pull/3030 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-11-09 23:07:14 -0800 + Commit: 227488d, github.com/apache/spark/pull/2898 + + SPARK-3179. Add task OutputMetrics. + Sandy Ryza + 2014-11-09 22:29:03 -0800 + Commit: 3c2cff4, github.com/apache/spark/pull/2968 + + SPARK-1209 [CORE] (Take 2) SparkHadoop{MapRed,MapReduce}Util should not use package org.apache.hadoop + Sean Owen + 2014-11-09 22:11:20 -0800 + Commit: f8e5732, github.com/apache/spark/pull/3048 + + MAINTENANCE: Automated closing of pull requests. 
+ Patrick Wendell + 2014-11-09 18:16:20 -0800 + Commit: f73b56f, github.com/apache/spark/pull/464 + + SPARK-1344 [DOCS] Scala API docs for top methods + Sean Owen + 2014-11-09 17:42:08 -0800 + Commit: d136265, github.com/apache/spark/pull/3168 + + SPARK-971 [DOCS] Link to Confluence wiki from project website / documentation + Sean Owen + 2014-11-09 17:40:48 -0800 + Commit: 8c99a47, github.com/apache/spark/pull/3169 + + [SPARK-4301] StreamingContext should not allow start() to be called after calling stop() + Josh Rosen + 2014-11-08 18:10:23 -0800 + Commit: 7b41b17, github.com/apache/spark/pull/3160 + + [Minor] [Core] Don't NPE on closeQuietly(null) + Aaron Davidson + 2014-11-08 13:03:51 -0800 + Commit: 4af5c7e, github.com/apache/spark/pull/3166 + + [SPARK-4291][Build] Rename network module projects + Andrew Or + 2014-11-07 23:16:13 -0800 + Commit: 7afc856, github.com/apache/spark/pull/3148 + + [MLLIB] [PYTHON] SPARK-4221: Expose nonnegative ALS in the python API + Michelangelo D'Agostino + 2014-11-07 22:53:01 -0800 + Commit: 7e9d975, github.com/apache/spark/pull/3095 + + [SPARK-4304] [PySpark] Fix sort on empty RDD + Davies Liu + 2014-11-07 20:53:03 -0800 + Commit: 7779109, github.com/apache/spark/pull/3162 + + MAINTENANCE: Automated closing of pull requests. + Patrick Wendell + 2014-11-07 13:08:25 -0800 + Commit: 5923dd9, github.com/apache/spark/pull/3016 + + Update JavaCustomReceiver.java + xiao321 <1042460381@qq.com> + 2014-11-07 12:56:49 -0800 + Commit: 7c9ec52, github.com/apache/spark/pull/3153 + + [SPARK-4292][SQL] Result set iterator bug in JDBC/ODBC + wangfei + 2014-11-07 12:55:11 -0800 + Commit: d6e5552, github.com/apache/spark/pull/3149 + + [SPARK-4203][SQL] Partition directories in random order when inserting into hive table + Matthew Taylor + 2014-11-07 12:53:08 -0800 + Commit: ac70c97, github.com/apache/spark/pull/3076 + + [SPARK-4270][SQL] Fix Cast from DateType to DecimalType. 
+ Takuya UESHIN + 2014-11-07 12:30:47 -0800 + Commit: a6405c5, github.com/apache/spark/pull/3134 + + [SPARK-4272] [SQL] Add more unwrapper functions for primitive type in TableReader + Cheng Hao + 2014-11-07 12:15:53 -0800 + Commit: 60ab80f, github.com/apache/spark/pull/3136 + + [SPARK-4213][SQL] ParquetFilters - No support for LT, LTE, GT, GTE operators + Kousuke Saruta + 2014-11-07 11:56:40 -0800 + Commit: 14c54f1, github.com/apache/spark/pull/3083 + + [SQL] Modify keyword val location according to ordering + Jacky Li + 2014-11-07 11:52:08 -0800 + Commit: 68609c5, github.com/apache/spark/pull/3080 + + [SQL] Support ScalaReflection of schema in different universes + Michael Armbrust + 2014-11-07 11:51:20 -0800 + Commit: 8154ed7, github.com/apache/spark/pull/3096 + + [SPARK-4225][SQL] Resorts to SparkContext.version to inspect Spark version + Cheng Lian + 2014-11-07 11:45:25 -0800 + Commit: 86e9eaa, github.com/apache/spark/pull/3105 + + [SQL][DOC][Minor] Spark SQL Hive now support dynamic partitioning + wangfei + 2014-11-07 11:43:35 -0800 + Commit: 636d7bc, github.com/apache/spark/pull/3127 + + [SPARK-4187] [Core] Switch to binary protocol for external shuffle service messages + Aaron Davidson + 2014-11-07 09:42:21 -0800 + Commit: d4fa04e, github.com/apache/spark/pull/3146 + + [SPARK-4204][Core][WebUI] Change Utils.exceptionString to contain the inner exceptions and make the error information in Web UI more friendly + zsxwing + 2014-11-06 21:52:12 -0800 + Commit: 3abdb1b, github.com/apache/spark/pull/3073 + + [SPARK-4236] Cleanup removed applications' files in shuffle service + Aaron Davidson + 2014-11-06 19:54:32 -0800 + Commit: 48a19a6, github.com/apache/spark/pull/3126 + + [SPARK-4188] [Core] Perform network-level retry of shuffle file fetches + Aaron Davidson + 2014-11-06 18:39:14 -0800 + Commit: f165b2b, github.com/apache/spark/pull/3101 + + [SPARK-4277] Support external shuffle service on Standalone Worker + Aaron Davidson + 2014-11-06 17:20:46 -0800 + Commit: 6e9ef10, github.com/apache/spark/pull/3142 + + [SPARK-3797] Minor addendum to Yarn shuffle service + Andrew Or + 2014-11-06 17:18:49 -0800 + Commit: 96136f2, github.com/apache/spark/pull/3144 + + [HOT FIX] Make distribution fails + Andrew Or + 2014-11-06 15:31:07 -0800 + Commit: 470881b, github.com/apache/spark/pull/3145 + + [SPARK-4249][GraphX]fix a problem of EdgePartitionBuilder in Graphx + lianhuiwang + 2014-11-06 10:46:45 -0800 + Commit: d15c6e9, github.com/apache/spark/pull/3138 + + [SPARK-4264] Completion iterator should only invoke callback once + Aaron Davidson + 2014-11-06 10:45:46 -0800 + Commit: 23eaf0e, github.com/apache/spark/pull/3128 + + [SPARK-4186] add binaryFiles and binaryRecords in Python + Davies Liu + 2014-11-06 00:22:19 -0800 + Commit: b41a39e, github.com/apache/spark/pull/3078 + + [SPARK-4255] Fix incorrect table striping + Kay Ousterhout + 2014-11-06 00:03:03 -0800 + Commit: 5f27ae1, github.com/apache/spark/pull/3117 + + [SPARK-4137] [EC2] Don't change working dir on user + Nicholas Chammas + 2014-11-05 20:45:35 -0800 + Commit: db45f5a, github.com/apache/spark/pull/2988 + + [SPARK-4262][SQL] add .schemaRDD to JavaSchemaRDD + Xiangrui Meng + 2014-11-05 19:56:16 -0800 + Commit: 3d2b5bc, github.com/apache/spark/pull/3125 + + 
[SPARK-4254] [mllib] MovieLensALS bug fix + Joseph K. Bradley + 2014-11-05 19:51:18 -0800 + Commit: c315d13, github.com/apache/spark/pull/3116 + + [SPARK-4158] Fix for missing resources. + Brenden Matthews + 2014-11-05 16:02:44 -0800 + Commit: cb0eae3, github.com/apache/spark/pull/3024 + + SPARK-3223 runAsSparkUser cannot change HDFS write permission properly i... + Jongyoul Lee + 2014-11-05 15:49:42 -0800 + Commit: f7ac8c2, github.com/apache/spark/pull/3034 + + SPARK-4040. Update documentation to exemplify use of local (n) value, fo... + jay@apache.org + 2014-11-05 15:45:34 -0800 + Commit: 868cd4c, github.com/apache/spark/pull/2964 + + [SPARK-3797] Run external shuffle service in Yarn NM + Andrew Or + 2014-11-05 15:42:05 -0800 + Commit: 61a5cce, github.com/apache/spark/pull/3082 + + SPARK-4222 [CORE] use readFully in FixedLengthBinaryRecordReader + industrial-sloth + 2014-11-05 15:38:48 -0800 + Commit: f37817b, github.com/apache/spark/pull/3093 + + [SPARK-3984] [SPARK-3983] Fix incorrect scheduler delay and display task deserialization time in UI + Kay Ousterhout + 2014-11-05 15:30:31 -0800 + Commit: a46497e, github.com/apache/spark/pull/2832 + + [SPARK-4242] [Core] Add SASL to external shuffle service + Aaron Davidson + 2014-11-05 14:38:43 -0800 + Commit: 4c42986, github.com/apache/spark/pull/3108 + + [SPARK-4197] [mllib] GradientBoosting API cleanup and examples in Scala, Java + Joseph K. Bradley + 2014-11-05 10:33:13 -0800 + Commit: 5b3b6f6, github.com/apache/spark/pull/3094 + + [SPARK-4029][Streaming] Update streaming driver to reliably save and recover received block metadata on driver failures + Tathagata Das + 2014-11-05 01:21:53 -0800 + Commit: 5f13759, github.com/apache/spark/pull/3026 + + [SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API + Davies Liu + 2014-11-04 21:35:52 -0800 + Commit: c8abddc, github.com/apache/spark/pull/3091 + + [SQL] Add String option for DSL AS + Michael Armbrust + 2014-11-04 18:14:28 -0800 + Commit: 515abb9, github.com/apache/spark/pull/3097 + + [SPARK-2938] Support SASL authentication in NettyBlockTransferService + Aaron Davidson + 2014-11-04 16:15:38 -0800 + Commit: 5e73138, github.com/apache/spark/pull/3087 + + [Spark-4060] [MLlib] exposing special rdd functions to the public + Niklas Wilcke <1wilcke@informatik.uni-hamburg.de> + 2014-11-04 09:57:03 -0800 + Commit: f90ad5d, github.com/apache/spark/pull/2907 + + fixed MLlib Naive-Bayes java example bug + Dariusz Kobylarz + 2014-11-04 09:53:43 -0800 + Commit: bcecd73, github.com/apache/spark/pull/3081 + + [SPARK-3886] [PySpark] simplify serializer, use AutoBatchedSerializer by default. 
+ Davies Liu + 2014-11-03 23:56:14 -0800 + Commit: e4f4263, github.com/apache/spark/pull/2920 + + [SPARK-4166][Core] Add a backward compatibility test for ExecutorLostFailure + zsxwing + 2014-11-03 22:47:45 -0800 + Commit: b671ce0, github.com/apache/spark/pull/3085 + + [SPARK-4163][Core] Add a backward compatibility test for FetchFailed + zsxwing + 2014-11-03 22:40:43 -0800 + Commit: 9bdc841, github.com/apache/spark/pull/3086 + + [SPARK-3573][MLLIB] Make MLlib's Vector compatible with SQL's SchemaRDD + Xiangrui Meng + 2014-11-03 22:29:48 -0800 + Commit: 1a9c6cd, github.com/apache/spark/pull/3070 + + [SPARK-4192][SQL] Internal API for Python UDT + Xiangrui Meng + 2014-11-03 19:29:11 -0800 + Commit: 04450d1, github.com/apache/spark/pull/3068 + + [FIX][MLLIB] fix seed in BaggedPointSuite + Xiangrui Meng + 2014-11-03 18:50:37 -0800 + Commit: c5912ec, github.com/apache/spark/pull/3084 + + [SPARK-611] Display executor thread dumps in web UI + Josh Rosen + 2014-11-03 18:18:47 -0800 + Commit: 4f035dd, github.com/apache/spark/pull/2944 + + [SPARK-4168][WebUI] web statges number should show correctly when stages are more than 1000 + Zhang, Liye + 2014-11-03 18:17:32 -0800 + Commit: 97a466e, github.com/apache/spark/pull/3035 + + [SQL] Convert arguments to Scala UDFs + Michael Armbrust + 2014-11-03 18:04:51 -0800 + Commit: 15b58a2, github.com/apache/spark/pull/3077 + + SPARK-4178. Hadoop input metrics ignore bytes read in RecordReader insta... + Sandy Ryza + 2014-11-03 15:19:01 -0800 + Commit: 2812815, github.com/apache/spark/pull/3045 + + [SQL] More aggressive defaults + Michael Armbrust + 2014-11-03 14:08:27 -0800 + Commit: 25bef7e, github.com/apache/spark/pull/3064 + + [SPARK-4152] [SQL] Avoid data change in CTAS while table already existed + Cheng Hao + 2014-11-03 13:59:43 -0800 + Commit: e83f13e, github.com/apache/spark/pull/3013 + + [SPARK-4202][SQL] Simple DSL support for Scala UDF + Cheng Lian + 2014-11-03 13:20:33 -0800 + Commit: c238fb4, github.com/apache/spark/pull/3067 + + [SPARK-3594] [PySpark] [SQL] take more rows to infer schema or sampling + Davies Liu , Davies Liu + 2014-11-03 13:17:09 -0800 + Commit: 24544fb, github.com/apache/spark/pull/2716 + + [SPARK-4207][SQL] Query which has syntax like 'not like' is not working in Spark SQL + ravipesala + 2014-11-03 13:07:41 -0800 + Commit: 2b6e1ce, github.com/apache/spark/pull/3075 + + [SPARK-4211][Build] Fixes hive.version in Maven profile hive-0.13.1 + fi + 2014-11-03 12:56:56 -0800 + Commit: df607da, github.com/apache/spark/pull/3072 + + [SPARK-4148][PySpark] fix seed distribution and add some tests for rdd.sample + Xiangrui Meng + 2014-11-03 12:24:24 -0800 + Commit: 3cca196, github.com/apache/spark/pull/3010 + + [EC2] Factor out Mesos spark-ec2 branch + Nicholas Chammas + 2014-11-03 09:02:35 -0800 + Commit: 2aca97c, github.com/apache/spark/pull/3008 + diff --git a/R/create-docs.sh b/R/create-docs.sh index 6a4687b06ecb..d2ae160b5002 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -39,7 +39,7 @@ pushd $FWDIR mkdir -p pkg/html pushd pkg/html -Rscript -e 'library(SparkR, lib.loc="../../lib"); library(knitr); knit_rd("SparkR")' +Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", 
sep="/")))' popd diff --git a/R/install-dev.bat b/R/install-dev.bat index f32670b67de9..008a5c668bc4 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -25,8 +25,3 @@ set SPARK_HOME=%~dp0.. MKDIR %SPARK_HOME%\R\lib R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" %SPARK_HOME%\R\pkg\ - -rem Zip the SparkR package so that it can be distributed to worker nodes on YARN -pushd %SPARK_HOME%\R\lib -%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR -popd diff --git a/R/install-dev.sh b/R/install-dev.sh index 4972bb921707..59d98c9c7a64 100755 --- a/R/install-dev.sh +++ b/R/install-dev.sh @@ -42,8 +42,4 @@ Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtoo # Install SparkR to $LIB_DIR R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/ -# Zip the SparkR package so that it can be distributed to worker nodes on YARN -cd $LIB_DIR -jar cfM "$LIB_DIR/sparkr.zip" SparkR - popd > /dev/null diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 4949d86d20c9..d0d7201f004a 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,7 +1,7 @@ Package: SparkR Type: Package Title: R frontend for Spark -Version: 1.4.0 +Version: 1.5.0 Date: 2013-09-09 Author: The Apache Software Foundation Maintainer: Shivaram Venkataraman @@ -29,6 +29,7 @@ Collate: 'client.R' 'context.R' 'deserialize.R' + 'functions.R' 'mllib.R' 'serialize.R' 'sparkR.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index b2d92bdf4840..9d3963070643 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -47,12 +47,12 @@ exportMethods("arrange", "join", "limit", "merge", + "mutate", + "na.omit", "names", "ncol", "nrow", "orderBy", - "mutate", - "names", "persist", "printSchema", "rbind", @@ -69,9 +69,11 @@ exportMethods("arrange", "selectExpr", "show", "showDF", + "subset", "summarize", "summary", "take", + "transform", "unionAll", "unique", "unpersist", @@ -82,59 +84,139 @@ exportMethods("arrange", exportClasses("Column") -exportMethods("abs", +exportMethods("%in%", + "abs", "acos", + "add_months", "alias", "approxCountDistinct", "asc", + "ascii", "asin", "atan", "atan2", "avg", + "base64", "between", + "bin", + "bitwiseNOT", "cast", "cbrt", + "ceil", "ceiling", + "concat", + "concat_ws", "contains", + "conv", "cos", "cosh", + "count", "countDistinct", + "crc32", + "date_add", + "date_format", + "date_sub", + "datediff", + "dayofmonth", + "dayofyear", "desc", "endsWith", "exp", + "explode", "expm1", + "expr", + "factorial", + "first", "floor", + "format_number", + "format_string", + "from_unixtime", + "from_utc_timestamp", "getField", "getItem", + "greatest", + "hex", + "hour", "hypot", + "ifelse", + "initcap", + "instr", + "isNaN", "isNotNull", "isNull", "last", + "last_day", + "least", + "length", + "levenshtein", "like", + "lit", + "locate", "log", "log10", "log1p", + "log2", "lower", + "lpad", + "ltrim", "max", + "md5", "mean", "min", + "minute", + "month", + "months_between", "n", "n_distinct", + "nanvl", + "negate", + "next_day", + "otherwise", + "pmod", + "quarter", + "rand", + "randn", + "regexp_extract", + "regexp_replace", + "reverse", "rint", "rlike", + "round", + "rpad", + "rtrim", + "second", + "sha1", + "sha2", + "shiftLeft", + "shiftRight", + "shiftRightUnsigned", "sign", + "signum", "sin", "sinh", + "size", + "soundex", "sqrt", "startsWith", "substr", + "substring_index", "sum", "sumDistinct", "tan", "tanh", "toDegrees", "toRadians", - "upper") + "to_date", + "to_utc_timestamp", + "translate", + "trim", + "unbase64", + "unhex", + "unix_timestamp", + "upper", + "weekofyear", + "when", 
+ "year") exportClasses("GroupedData") exportMethods("agg") diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 895603235011..f833e70a0f06 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -27,9 +27,10 @@ setOldClass("jobj") #' \code{jsonFile}, \code{table} etc. #' @rdname DataFrame #' @seealso jsonFile, table +#' @docType class #' -#' @param env An R environment that stores bookkeeping states of the DataFrame -#' @param sdf A Java object reference to the backing Scala DataFrame +#' @slot env An R environment that stores bookkeeping states of the DataFrame +#' @slot sdf A Java object reference to the backing Scala DataFrame #' @export setClass("DataFrame", slots = list(env = "environment", @@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) { #' @param x A SparkSQL DataFrame #' #' @rdname printSchema +#' @name printSchema #' @export #' @examples #'\dontrun{ @@ -84,6 +86,7 @@ setMethod("printSchema", #' @param x A SparkSQL DataFrame #' #' @rdname schema +#' @name schema #' @export #' @examples #'\dontrun{ @@ -106,6 +109,7 @@ setMethod("schema", #' @param x A SparkSQL DataFrame #' @param extended Logical. If extended is False, explain() only prints the physical plan. #' @rdname explain +#' @name explain #' @export #' @examples #'\dontrun{ @@ -135,6 +139,7 @@ setMethod("explain", #' @param x A SparkSQL DataFrame #' #' @rdname isLocal +#' @name isLocal #' @export #' @examples #'\dontrun{ @@ -158,6 +163,7 @@ setMethod("isLocal", #' @param numRows The number of rows to print. Defaults to 20. #' #' @rdname showDF +#' @name showDF #' @export #' @examples #'\dontrun{ @@ -181,6 +187,7 @@ setMethod("showDF", #' @param x A SparkSQL DataFrame #' #' @rdname show +#' @name show #' @export #' @examples #'\dontrun{ @@ -206,6 +213,7 @@ setMethod("show", "DataFrame", #' @param x A SparkSQL DataFrame #' #' @rdname dtypes +#' @name dtypes #' @export #' @examples #'\dontrun{ @@ -230,6 +238,8 @@ setMethod("dtypes", #' @param x A SparkSQL DataFrame #' #' @rdname columns +#' @name columns +#' @aliases names #' @export #' @examples #'\dontrun{ @@ -248,7 +258,7 @@ setMethod("columns", }) #' @rdname columns -#' @aliases names,DataFrame,function-method +#' @name names setMethod("names", signature(x = "DataFrame"), function(x) { @@ -256,6 +266,7 @@ setMethod("names", }) #' @rdname columns +#' @name names<- setMethod("names<-", signature(x = "DataFrame"), function(x, value) { @@ -273,6 +284,7 @@ setMethod("names<-", #' @param tableName A character vector containing the name of the table #' #' @rdname registerTempTable +#' @name registerTempTable #' @export #' @examples #'\dontrun{ @@ -299,6 +311,7 @@ setMethod("registerTempTable", #' the existing rows in the table. #' #' @rdname insertInto +#' @name insertInto #' @export #' @examples #'\dontrun{ @@ -321,7 +334,8 @@ setMethod("insertInto", #' #' @param x A SparkSQL DataFrame #' -#' @rdname cache-methods +#' @rdname cache +#' @name cache #' @export #' @examples #'\dontrun{ @@ -347,6 +361,7 @@ setMethod("cache", #' #' @param x The DataFrame to persist #' @rdname persist +#' @name persist #' @export #' @examples #'\dontrun{ @@ -372,6 +387,7 @@ setMethod("persist", #' @param x The DataFrame to unpersist #' @param blocking Whether to block until all blocks are deleted #' @rdname unpersist-methods +#' @name unpersist #' @export #' @examples #'\dontrun{ @@ -397,6 +413,7 @@ setMethod("unpersist", #' @param x A SparkSQL DataFrame #' @param numPartitions The number of partitions to use. 
#' @rdname repartition +#' @name repartition #' @export #' @examples #'\dontrun{ @@ -446,6 +463,7 @@ setMethod("toJSON", #' @param x A SparkSQL DataFrame #' @param path The directory where the file is saved #' @rdname saveAsParquetFile +#' @name saveAsParquetFile #' @export #' @examples #'\dontrun{ @@ -467,6 +485,7 @@ setMethod("saveAsParquetFile", #' #' @param x A SparkSQL DataFrame #' @rdname distinct +#' @name distinct #' @export #' @examples #'\dontrun{ @@ -488,7 +507,8 @@ setMethod("distinct", #' @description Returns a new DataFrame containing distinct rows in this DataFrame #' #' @rdname unique -#' @aliases unique +#' @name unique +#' @aliases distinct setMethod("unique", signature(x = "DataFrame"), function(x) { @@ -526,7 +546,7 @@ setMethod("sample", }) #' @rdname sample -#' @aliases sample +#' @name sample_frac setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", fraction = "numeric"), @@ -541,6 +561,8 @@ setMethod("sample_frac", #' @param x A SparkSQL DataFrame #' #' @rdname count +#' @name count +#' @aliases nrow #' @export #' @examples #'\dontrun{ @@ -574,6 +596,7 @@ setMethod("nrow", #' @param x a SparkSQL DataFrame #' #' @rdname ncol +#' @name ncol #' @export #' @examples #'\dontrun{ @@ -593,6 +616,7 @@ setMethod("ncol", #' @param x a SparkSQL DataFrame #' #' @rdname dim +#' @name dim #' @export #' @examples #'\dontrun{ @@ -613,8 +637,8 @@ setMethod("dim", #' @param x A SparkSQL DataFrame #' @param stringsAsFactors (Optional) A logical indicating whether or not string columns #' should be converted to factors. FALSE by default. - -#' @rdname collect-methods +#' @rdname collect +#' @name collect #' @export #' @examples #'\dontrun{ @@ -650,6 +674,7 @@ setMethod("collect", #' @return A new DataFrame containing the number of rows specified. #' #' @rdname limit +#' @name limit #' @export #' @examples #' \dontrun{ @@ -669,6 +694,7 @@ setMethod("limit", #' Take the first NUM rows of a DataFrame and return a the results as a data.frame #' #' @rdname take +#' @name take #' @export #' @examples #'\dontrun{ @@ -696,6 +722,7 @@ setMethod("take", #' @return A data.frame #' #' @rdname head +#' @name head #' @export #' @examples #'\dontrun{ @@ -717,6 +744,7 @@ setMethod("head", #' @param x A SparkSQL DataFrame #' #' @rdname first +#' @name first #' @export #' @examples #'\dontrun{ @@ -732,7 +760,7 @@ setMethod("first", take(x, 1) }) -# toRDD() +# toRDD # # Converts a Spark DataFrame to an RDD while preserving column names. # @@ -769,6 +797,7 @@ setMethod("toRDD", #' @seealso GroupedData #' @aliases group_by #' @rdname groupBy +#' @name groupBy #' @export #' @examples #' \dontrun{ @@ -792,7 +821,7 @@ setMethod("groupBy", }) #' @rdname groupBy -#' @aliases group_by +#' @name group_by setMethod("group_by", signature(x = "DataFrame"), function(x, ...) { @@ -804,7 +833,8 @@ setMethod("group_by", #' Compute aggregates by specifying a list of columns #' #' @param x a DataFrame -#' @rdname DataFrame +#' @rdname agg +#' @name agg #' @aliases summarize #' @export setMethod("agg", @@ -813,8 +843,8 @@ setMethod("agg", agg(groupBy(x), ...) }) -#' @rdname DataFrame -#' @aliases agg +#' @rdname agg +#' @name summarize setMethod("summarize", signature(x = "DataFrame"), function(x, ...) 
{ @@ -890,12 +920,14 @@ getColumn <- function(x, c) { } #' @rdname select +#' @name $ setMethod("$", signature(x = "DataFrame"), function(x, name) { getColumn(x, name) }) #' @rdname select +#' @name $<- setMethod("$<-", signature(x = "DataFrame"), function(x, name, value) { stopifnot(class(value) == "Column" || is.null(value)) @@ -922,8 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"), x }) -#' @rdname select -setMethod("[[", signature(x = "DataFrame"), +setClassUnion("numericOrcharacter", c("numeric", "character")) + +#' @rdname subset +#' @name [[ +setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { if (is.numeric(i)) { cols <- columns(x) @@ -932,7 +967,8 @@ setMethod("[[", signature(x = "DataFrame"), getColumn(x, i) }) -#' @rdname select +#' @rdname subset +#' @name [ setMethod("[", signature(x = "DataFrame", i = "missing"), function(x, i, j, ...) { if (is.numeric(j)) { @@ -945,6 +981,51 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) +#' @rdname subset +#' @name [ +setMethod("[", signature(x = "DataFrame", i = "Column"), + function(x, i, j, ...) { + # It could handle i as "character" but it seems confusing and not required + # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html + filtered <- filter(x, i) + if (!missing(j)) { + filtered[, j, ...] + } else { + filtered + } + }) + +#' Subset +#' +#' Return subsets of DataFrame according to given conditions +#' @param x A DataFrame +#' @param subset A logical expression to filter on rows +#' @param select expression for the single Column or a list of columns to select from the DataFrame +#' @return A new DataFrame containing only the rows that meet the condition with selected columns +#' @export +#' @rdname subset +#' @name subset +#' @aliases [ +#' @family subsetting functions +#' @examples +#' \dontrun{ +#' # Columns can be selected using `[[` and `[` +#' df[[2]] == df[["age"]] +#' df[,2] == df[,"age"] +#' df[,c("name", "age")] +#' # Or to filter rows +#' df[df$age > 20,] +#' # DataFrame can be subset on both rows and Columns +#' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] +#' subset(df, df$age %in% c(19, 30), 1:2) +#' subset(df, df$age %in% c(19), select = c(1,2)) +#' } +setMethod("subset", signature(x = "DataFrame"), + function(x, subset, select, ...) { + x[subset, select, ...] + }) + #' Select #' #' Selects a set of columns with names or Column expressions. @@ -953,6 +1034,8 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' @return A new DataFrame with selected columns #' @export #' @rdname select +#' @name select +#' @family subsetting functions #' @examples #' \dontrun{ #' select(df, "*") @@ -960,9 +1043,6 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' select(df, df$name, df$age + 1) #' select(df, c("col1", "col2")) #' select(df, list(df$name, df$age + 1)) -#' # Columns can also be selected using `[[` and `[` -#' df[[2]] == df[["age"]] -#' df[,2] == df[,"age"] #' # Similar to R data frames columns can also be selected using `$` #' df$age #' } @@ -1008,6 +1088,7 @@ setMethod("select", #' @param ... Additional expressions #' @return A DataFrame #' @rdname selectExpr +#' @name selectExpr #' @export #' @examples #'\dontrun{ @@ -1034,6 +1115,8 @@ setMethod("selectExpr", #' @param col A Column expression. #' @return A DataFrame with the new column added. 
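# A minimal usage sketch of the subsetting methods added above (`[[`, `[`, and
# subset()). Everything here is illustrative: `sc`, `sqlContext`, and `df` are
# hypothetical names created just for this sketch, with `df` built from R's
# built-in `faithful` data.
library(SparkR)
sc <- sparkR.init(master = "local")
sqlContext <- sparkRSQL.init(sc)
df <- createDataFrame(sqlContext, faithful)        # columns: eruptions, waiting
df[["waiting"]]                                    # extract a Column by name
df[df$waiting < 50, c("eruptions")]                # filter rows, then select columns
subset(df, df$waiting < 50, select = "eruptions")  # equivalent subset() call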
#' @rdname withColumn +#' @name withColumn +#' @aliases mutate transform #' @export #' @examples #'\dontrun{ @@ -1053,11 +1136,12 @@ setMethod("withColumn", #' #' Return a new DataFrame with the specified columns added. #' -#' @param x A DataFrame +#' @param .data A DataFrame #' @param col a named argument of the form name = col #' @return A new DataFrame with the new columns added. #' @rdname withColumn -#' @aliases withColumn +#' @name mutate +#' @aliases withColumn transform #' @export #' @examples #'\dontrun{ @@ -1067,10 +1151,12 @@ setMethod("withColumn", #' df <- jsonFile(sqlContext, path) #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2) #' names(newDF) # Will contain newCol, newCol2 +#' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2) #' } setMethod("mutate", - signature(x = "DataFrame"), - function(x, ...) { + signature(.data = "DataFrame"), + function(.data, ...) { + x <- .data cols <- list(...) stopifnot(length(cols) > 0) stopifnot(class(cols[[1]]) == "Column") @@ -1085,6 +1171,16 @@ setMethod("mutate", do.call(select, c(x, x$"*", cols)) }) +#' @export +#' @rdname withColumn +#' @name transform +#' @aliases withColumn mutate +setMethod("transform", + signature(`_data` = "DataFrame"), + function(`_data`, ...) { + mutate(`_data`, ...) + }) + #' WithColumnRenamed #' #' Rename an existing column in a DataFrame. @@ -1094,6 +1190,7 @@ setMethod("mutate", #' @param newCol The new column name. #' @return A DataFrame with the column name changed. #' @rdname withColumnRenamed +#' @name withColumnRenamed #' @export #' @examples #'\dontrun{ @@ -1124,6 +1221,7 @@ setMethod("withColumnRenamed", #' @param newCol A named pair of the form new_column_name = existing_column #' @return A DataFrame with the column name changed. #' @rdname withColumnRenamed +#' @name rename #' @aliases withColumnRenamed #' @export #' @examples @@ -1165,6 +1263,8 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' @param ... Additional sorting fields #' @return A DataFrame where all elements are sorted. #' @rdname arrange +#' @name arrange +#' @aliases orderby #' @export #' @examples #'\dontrun{ @@ -1191,7 +1291,7 @@ setMethod("arrange", }) #' @rdname arrange -#' @aliases orderBy,DataFrame,function-method +#' @name orderby setMethod("orderBy", signature(x = "DataFrame", col = "characterOrColumn"), function(x, col) { @@ -1207,6 +1307,8 @@ setMethod("orderBy", #' or a string containing a SQL statement #' @return A DataFrame containing only the rows that meet the condition. #' @rdname filter +#' @name filter +#' @family subsetting functions #' @export #' @examples #'\dontrun{ @@ -1228,7 +1330,7 @@ setMethod("filter", }) #' @rdname filter -#' @aliases where,DataFrame,function-method +#' @name where setMethod("where", signature(x = "DataFrame", condition = "characterOrColumn"), function(x, condition) { @@ -1247,6 +1349,7 @@ setMethod("where", #' 'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'. The default joinType is "inner". #' @return A DataFrame containing the result of the join operation. #' @rdname join +#' @name join #' @export #' @examples #'\dontrun{ @@ -1279,8 +1382,9 @@ setMethod("join", dataFrame(sdf) }) -#' rdname merge -#' aliases join +#' @rdname merge +#' @name merge +#' @aliases join setMethod("merge", signature(x = "DataFrame", y = "DataFrame"), function(x, y, joinExpr = NULL, joinType = NULL, ...) { @@ -1298,6 +1402,7 @@ setMethod("merge", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the union. 
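# A quick sketch of mutate() and its new transform() alias, continuing the
# illustrative `df` from the sketch above.
df2 <- mutate(df, wait_hours = df$waiting / 60)
df3 <- transform(df, ratio = df$eruptions / df$waiting)
head(arrange(df3, desc(df3$ratio)))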
#' @rdname unionAll +#' @name unionAll #' @export #' @examples #'\dontrun{ @@ -1319,6 +1424,7 @@ setMethod("unionAll", #' @description Returns a new DataFrame containing rows of all parameters. # #' @rdname rbind +#' @name rbind #' @aliases unionAll setMethod("rbind", signature(... = "DataFrame"), @@ -1339,6 +1445,7 @@ setMethod("rbind", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the intersect. #' @rdname intersect +#' @name intersect #' @export #' @examples #'\dontrun{ @@ -1364,6 +1471,7 @@ setMethod("intersect", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the except operation. #' @rdname except +#' @name except #' @export #' @examples #'\dontrun{ @@ -1403,6 +1511,8 @@ setMethod("except", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' #' #' @rdname write.df +#' @name write.df +#' @aliases saveDF #' @export #' @examples #'\dontrun{ @@ -1435,7 +1545,7 @@ setMethod("write.df", }) #' @rdname write.df -#' @aliases saveDF +#' @name saveDF #' @export setMethod("saveDF", signature(df = "DataFrame", path = "character"), @@ -1466,6 +1576,7 @@ setMethod("saveDF", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' #' #' @rdname saveAsTable +#' @name saveAsTable #' @export #' @examples #'\dontrun{ @@ -1505,6 +1616,8 @@ setMethod("saveAsTable", #' @param ... Additional expressions #' @return A DataFrame #' @rdname describe +#' @name describe +#' @aliases summary #' @export #' @examples #'\dontrun{ @@ -1525,6 +1638,7 @@ setMethod("describe", }) #' @rdname describe +#' @name describe setMethod("describe", signature(x = "DataFrame"), function(x) { @@ -1538,7 +1652,7 @@ setMethod("describe", #' @description Computes statistics for numeric columns of the DataFrame #' #' @rdname summary -#' @aliases describe +#' @name summary setMethod("summary", signature(x = "DataFrame"), function(x) { @@ -1562,6 +1676,8 @@ setMethod("summary", #' @return A DataFrame #' #' @rdname nafunctions +#' @name dropna +#' @aliases na.omit #' @export #' @examples #'\dontrun{ @@ -1588,12 +1704,13 @@ setMethod("dropna", dataFrame(sdf) }) -#' @aliases dropna +#' @rdname nafunctions +#' @name na.omit #' @export setMethod("na.omit", - signature(x = "DataFrame"), - function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { - dropna(x, how, minNonNulls, cols) + signature(object = "DataFrame"), + function(object, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { + dropna(object, how, minNonNulls, cols) }) #' fillna @@ -1615,6 +1732,7 @@ setMethod("na.omit", #' @return A DataFrame #' #' @rdname nafunctions +#' @name fillna #' @export #' @examples #'\dontrun{ @@ -1685,6 +1803,7 @@ setMethod("fillna", #' occurrences will have zero as their counts. 
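# A sketch of the NA-handling methods (na.omit() now dispatches on `object`,
# matching base R's generic). "people.json" is a hypothetical file whose
# records contain null fields; the session is the illustrative one from above.
people  <- jsonFile(sqlContext, "people.json")
cleaned <- na.omit(people)                              # alias of dropna()
filled  <- fillna(people, list(age = 0, name = "unknown"))
nrow(cleaned)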
#' #' @rdname statfunctions +#' @name crosstab #' @export #' @examples #' \dontrun{ diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 110117a18ccb..1bc644531147 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -201,7 +201,7 @@ setMethod("toDF", signature(x = "RDD"), jsonFile <- function(sqlContext, path) { # Allow the user to have a more flexible definiton of the text file path - path <- normalizePath(path) + path <- suppressWarnings(normalizePath(path)) # Convert a string vector of paths to a string containing comma separated paths path <- paste(path, collapse = ",") sdf <- callJMethod(sqlContext, "jsonFile", path) @@ -251,7 +251,7 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) { # TODO: Implement saveasParquetFile and write examples for both parquetFile <- function(sqlContext, ...) { # Allow the user to have a more flexible definiton of the text file path - paths <- lapply(list(...), normalizePath) + paths <- lapply(list(...), function(x) suppressWarnings(normalizePath(x))) sdf <- callJMethod(sqlContext, "parquetFile", paths) dataFrame(sdf) } diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index eeaf9f193b72..4805096f3f9c 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -24,10 +24,9 @@ setOldClass("jobj") #' @title S4 class that represents a DataFrame column #' @description The column class supports unary, binary operations on DataFrame columns - #' @rdname column #' -#' @param jc reference to JVM DataFrame column +#' @slot jc reference to JVM DataFrame column #' @export setClass("Column", slots = list(jc = "jobj")) @@ -46,6 +45,7 @@ col <- function(x) { } #' @rdname show +#' @name show setMethod("show", "Column", function(object) { cat("Column", callJMethod(object@jc, "toString"), "\n") @@ -60,12 +60,6 @@ operators <- list( ) column_functions1 <- c("asc", "desc", "isNull", "isNotNull") column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", "getItem", "contains") -functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt", - "first", "last", "lower", "upper", "sumDistinct", - "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp", - "expm1", "floor", "log", "log10", "log1p", "rint", "sign", - "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians") -binary_mathfunctions <- c("atan2", "hypot") createOperator <- function(op) { setMethod(op, @@ -111,33 +105,6 @@ createColumnFunction2 <- function(name) { }) } -createStaticFunction <- function(name) { - setMethod(name, - signature(x = "Column"), - function(x) { - if (name == "ceiling") { - name <- "ceil" - } - if (name == "sign") { - name <- "signum" - } - jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc) - column(jc) - }) -} - -createBinaryMathfunctions <- function(name) { - setMethod(name, - signature(y = "Column"), - function(y, x) { - if (class(x) == "Column") { - x <- x@jc - } - jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x) - column(jc) - }) -} - createMethods <- function() { for (op in names(operators)) { createOperator(op) @@ -148,12 +115,6 @@ createMethods <- function() { for (name in column_functions2) { createColumnFunction2(name) } - for (x in functions) { - createStaticFunction(x) - } - for (name in binary_mathfunctions) { - createBinaryMathfunctions(name) - } } createMethods() @@ -161,8 +122,11 @@ createMethods() #' alias #' #' Set a new name for a column - -#' @rdname column +#' +#' @rdname alias +#' @name alias +#' @family colum_func +#' @export setMethod("alias", signature(object = 
"Column"), function(object, data) { @@ -177,7 +141,9 @@ setMethod("alias", #' #' An expression that returns a substring. #' -#' @rdname column +#' @rdname substr +#' @name substr +#' @family colum_func #' #' @param start starting position #' @param stop ending position @@ -191,7 +157,9 @@ setMethod("substr", signature(x = "Column"), #' #' Test if the column is between the lower bound and upper bound, inclusive. #' -#' @rdname column +#' @rdname between +#' @name between +#' @family colum_func #' #' @param bounds lower and upper bounds setMethod("between", signature(x = "Column"), @@ -206,10 +174,11 @@ setMethod("between", signature(x = "Column"), #' Casts the column to a different data type. #' -#' @rdname column +#' @rdname cast +#' @name cast +#' @family colum_func #' -#' @examples -#' \dontrun{ +#' @examples \dontrun{ #' cast(df$age, "string") #' cast(df$name, list(type="array", elementType="byte", containsNull = TRUE)) #' } @@ -229,11 +198,15 @@ setMethod("cast", #' Match a column with given values. #' -#' @rdname column +#' @rdname match +#' @name %in% +#' @aliases %in% #' @return a matched values as a result of comparing with given values. +#' @export +#' @examples #' \dontrun{ -#' filter(df, "age in (10, 30)") -#' where(df, df$age %in% c(10, 30)) +#' filter(df, "age in (10, 30)") +#' where(df, df$age %in% c(10, 30)) #' } setMethod("%in%", signature(x = "Column"), @@ -243,44 +216,19 @@ setMethod("%in%", return(column(jc)) }) -#' Approx Count Distinct +#' otherwise #' -#' @rdname column -#' @return the approximate number of distinct items in a group. -setMethod("approxCountDistinct", - signature(x = "Column"), - function(x, rsd = 0.95) { - jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd) - column(jc) - }) - -#' Count Distinct +#' If values in the specified column are null, returns the value. +#' Can be used in conjunction with `when` to specify a default value for expressions. #' -#' @rdname column -#' @return the number of distinct items in a group. -setMethod("countDistinct", - signature(x = "Column"), - function(x, ...) { - jcol <- lapply(list(...), function (x) { - x@jc - }) - jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc, - listToSeq(jcol)) +#' @rdname otherwise +#' @name otherwise +#' @family colum_func +#' @export +setMethod("otherwise", + signature(x = "Column", value = "ANY"), + function(x, value) { + value <- ifelse(class(value) == "Column", value@jc, value) + jc <- callJMethod(x@jc, "otherwise", value) column(jc) }) - -#' @rdname column -#' @aliases countDistinct -setMethod("n_distinct", - signature(x = "Column"), - function(x, ...) { - countDistinct(x, ...) 
- }) - -#' @rdname column -#' @aliases count -setMethod("n", - signature(x = "Column"), - function(x) { - count(x) - }) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 6d364f77be7e..33bf13ec9e78 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -176,10 +176,14 @@ readRow <- function(inputCon) { # Take a single column as Array[Byte] and deserialize it into an atomic vector readCol <- function(inputCon, numRows) { - # sapply can not work with POSIXlt - do.call(c, lapply(1:numRows, function(x) { - value <- readObject(inputCon) - # Replace NULL with NA so we can coerce to vectors - if (is.null(value)) NA else value - })) + if (numRows > 0) { + # sapply can not work with POSIXlt + do.call(c, lapply(1:numRows, function(x) { + value <- readObject(inputCon) + # Replace NULL with NA so we can coerce to vectors + if (is.null(value)) NA else value + })) + } else { + vector() + } } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R new file mode 100644 index 000000000000..d848730e7043 --- /dev/null +++ b/R/pkg/R/functions.R @@ -0,0 +1,1988 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +#' @include generics.R column.R +NULL + +#' Creates a \code{Column} of literal value. +#' +#' The passed in object is returned directly if it is already a \linkS4class{Column}. +#' If the object is a Scala Symbol, it is converted into a \linkS4class{Column} also. +#' Otherwise, a new \linkS4class{Column} is created to represent the literal value. +#' +#' @family normal_funcs +#' @rdname lit +#' @name lit +#' @export +setMethod("lit", signature("ANY"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "lit", + ifelse(class(x) == "Column", x@jc, x)) + column(jc) + }) + +#' abs +#' +#' Computes the absolute value. +#' +#' @rdname abs +#' @name abs +#' @family normal_funcs +#' @export +#' @examples \dontrun{abs(df$c)} +setMethod("abs", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "abs", x@jc) + column(jc) + }) + +#' acos +#' +#' Computes the cosine inverse of the given value; the returned angle is in the range +#' 0.0 through pi. +#' +#' @rdname acos +#' @name acos +#' @family math_funcs +#' @export +#' @examples \dontrun{acos(df$c)} +setMethod("acos", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "acos", x@jc) + column(jc) + }) + +#' approxCountDistinct +#' +#' Aggregate function: returns the approximate number of distinct items in a group. 
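# A small sketch of the new Column functions defined in functions.R, such as
# lit(), abs(), and approxCountDistinct(), continuing the illustrative `df`
# from the earlier sketch.
head(select(df, df$waiting, df$waiting + lit(10), abs(df$eruptions - 3)))
collect(agg(df, approxCountDistinct(df$waiting)))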
+#' +#' @rdname approxCountDistinct +#' @name approxCountDistinct +#' @family agg_funcs +#' @export +#' @examples \dontrun{approxCountDistinct(df$c)} +setMethod("approxCountDistinct", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc) + column(jc) + }) + +#' ascii +#' +#' Computes the numeric value of the first character of the string column, and returns the +#' result as a int column. +#' +#' @rdname ascii +#' @name ascii +#' @family string_funcs +#' @export +#' @examples \dontrun{\dontrun{ascii(df$c)}} +setMethod("ascii", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ascii", x@jc) + column(jc) + }) + +#' asin +#' +#' Computes the sine inverse of the given value; the returned angle is in the range +#' -pi/2 through pi/2. +#' +#' @rdname asin +#' @name asin +#' @family math_funcs +#' @export +#' @examples \dontrun{asin(df$c)} +setMethod("asin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "asin", x@jc) + column(jc) + }) + +#' atan +#' +#' Computes the tangent inverse of the given value. +#' +#' @rdname atan +#' @name atan +#' @family math_funcs +#' @export +#' @examples \dontrun{atan(df$c)} +setMethod("atan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "atan", x@jc) + column(jc) + }) + +#' avg +#' +#' Aggregate function: returns the average of the values in a group. +#' +#' @rdname avg +#' @name avg +#' @family agg_funcs +#' @export +#' @examples \dontrun{avg(df$c)} +setMethod("avg", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "avg", x@jc) + column(jc) + }) + +#' base64 +#' +#' Computes the BASE64 encoding of a binary column and returns it as a string column. +#' This is the reverse of unbase64. +#' +#' @rdname base64 +#' @name base64 +#' @family string_funcs +#' @export +#' @examples \dontrun{base64(df$c)} +setMethod("base64", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "base64", x@jc) + column(jc) + }) + +#' bin +#' +#' An expression that returns the string representation of the binary value of the given long +#' column. For example, bin("12") returns "1100". +#' +#' @rdname bin +#' @name bin +#' @family math_funcs +#' @export +#' @examples \dontrun{bin(df$c)} +setMethod("bin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bin", x@jc) + column(jc) + }) + +#' bitwiseNOT +#' +#' Computes bitwise NOT. +#' +#' @rdname bitwiseNOT +#' @name bitwiseNOT +#' @family normal_funcs +#' @export +#' @examples \dontrun{bitwiseNOT(df$c)} +setMethod("bitwiseNOT", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc) + column(jc) + }) + +#' cbrt +#' +#' Computes the cube-root of the given value. +#' +#' @rdname cbrt +#' @name cbrt +#' @family math_funcs +#' @export +#' @examples \dontrun{cbrt(df$c)} +setMethod("cbrt", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cbrt", x@jc) + column(jc) + }) + +#' ceil +#' +#' Computes the ceiling of the given value. 
+#' +#' @rdname ceil +#' @name ceil +#' @family math_funcs +#' @export +#' @examples \dontrun{ceil(df$c)} +setMethod("ceil", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ceil", x@jc) + column(jc) + }) + +#' cos +#' +#' Computes the cosine of the given value. +#' +#' @rdname cos +#' @name cos +#' @family math_funcs +#' @export +#' @examples \dontrun{cos(df$c)} +setMethod("cos", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cos", x@jc) + column(jc) + }) + +#' cosh +#' +#' Computes the hyperbolic cosine of the given value. +#' +#' @rdname cosh +#' @name cosh +#' @family math_funcs +#' @export +#' @examples \dontrun{cosh(df$c)} +setMethod("cosh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cosh", x@jc) + column(jc) + }) + +#' count +#' +#' Aggregate function: returns the number of items in a group. +#' +#' @rdname count +#' @name count +#' @family agg_funcs +#' @export +#' @examples \dontrun{count(df$c)} +setMethod("count", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "count", x@jc) + column(jc) + }) + +#' crc32 +#' +#' Calculates the cyclic redundancy check value (CRC32) of a binary column and +#' returns the value as a bigint. +#' +#' @rdname crc32 +#' @name crc32 +#' @family misc_funcs +#' @export +#' @examples \dontrun{crc32(df$c)} +setMethod("crc32", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "crc32", x@jc) + column(jc) + }) + +#' dayofmonth +#' +#' Extracts the day of the month as an integer from a given date/timestamp/string. +#' +#' @rdname dayofmonth +#' @name dayofmonth +#' @family datetime_funcs +#' @export +#' @examples \dontrun{dayofmonth(df$c)} +setMethod("dayofmonth", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "dayofmonth", x@jc) + column(jc) + }) + +#' dayofyear +#' +#' Extracts the day of the year as an integer from a given date/timestamp/string. +#' +#' @rdname dayofyear +#' @name dayofyear +#' @family datetime_funcs +#' @export +#' @examples \dontrun{dayofyear(df$c)} +setMethod("dayofyear", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "dayofyear", x@jc) + column(jc) + }) + +#' exp +#' +#' Computes the exponential of the given value. +#' +#' @rdname exp +#' @name exp +#' @family math_funcs +#' @export +#' @examples \dontrun{exp(df$c)} +setMethod("exp", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "exp", x@jc) + column(jc) + }) + +#' explode +#' +#' Creates a new row for each element in the given array or map column. +#' +#' @rdname explode +#' @name explode +#' @family collection_funcs +#' @export +#' @examples \dontrun{explode(df$c)} +setMethod("explode", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "explode", x@jc) + column(jc) + }) + +#' expm1 +#' +#' Computes the exponential of the given value minus one. +#' +#' @rdname expm1 +#' @name expm1 +#' @family math_funcs +#' @export +#' @examples \dontrun{expm1(df$c)} +setMethod("expm1", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "expm1", x@jc) + column(jc) + }) + +#' factorial +#' +#' Computes the factorial of the given value. 
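# A sketch combining the aggregate and math helpers above in a grouped
# aggregation, continuing the illustrative `df`.
grouped <- groupBy(df, ceil(df$eruptions))
head(agg(grouped, count(df$waiting), avg(df$waiting)))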
+#' +#' @rdname factorial +#' @name factorial +#' @family math_funcs +#' @export +#' @examples \dontrun{factorial(df$c)} +setMethod("factorial", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "factorial", x@jc) + column(jc) + }) + +#' first +#' +#' Aggregate function: returns the first value in a group. +#' +#' @rdname first +#' @name first +#' @family agg_funcs +#' @export +#' @examples \dontrun{first(df$c)} +setMethod("first", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "first", x@jc) + column(jc) + }) + +#' floor +#' +#' Computes the floor of the given value. +#' +#' @rdname floor +#' @name floor +#' @family math_funcs +#' @export +#' @examples \dontrun{floor(df$c)} +setMethod("floor", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "floor", x@jc) + column(jc) + }) + +#' hex +#' +#' Computes hex value of the given column. +#' +#' @rdname hex +#' @name hex +#' @family math_funcs +#' @export +#' @examples \dontrun{hex(df$c)} +setMethod("hex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "hex", x@jc) + column(jc) + }) + +#' hour +#' +#' Extracts the hours as an integer from a given date/timestamp/string. +#' +#' @rdname hour +#' @name hour +#' @family datetime_funcs +#' @export +#' @examples \dontrun{hour(df$c)} +setMethod("hour", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "hour", x@jc) + column(jc) + }) + +#' initcap +#' +#' Returns a new string column by converting the first letter of each word to uppercase. +#' Words are delimited by whitespace. +#' +#' For example, "hello world" will become "Hello World". +#' +#' @rdname initcap +#' @name initcap +#' @family string_funcs +#' @export +#' @examples \dontrun{initcap(df$c)} +setMethod("initcap", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "initcap", x@jc) + column(jc) + }) + +#' isNaN +#' +#' Return true iff the column is NaN. +#' +#' @rdname isNaN +#' @name isNaN +#' @family normal_funcs +#' @export +#' @examples \dontrun{isNaN(df$c)} +setMethod("isNaN", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "isNaN", x@jc) + column(jc) + }) + +#' last +#' +#' Aggregate function: returns the last value in a group. +#' +#' @rdname last +#' @name last +#' @family agg_funcs +#' @export +#' @examples \dontrun{last(df$c)} +setMethod("last", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "last", x@jc) + column(jc) + }) + +#' last_day +#' +#' Given a date column, returns the last day of the month which the given date belongs to. +#' For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the +#' month in July 2015. +#' +#' @rdname last_day +#' @name last_day +#' @family datetime_funcs +#' @export +#' @examples \dontrun{last_day(df$c)} +setMethod("last_day", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "last_day", x@jc) + column(jc) + }) + +#' length +#' +#' Computes the length of a given string or binary column. 
+#' +#' @rdname length +#' @name length +#' @family string_funcs +#' @export +#' @examples \dontrun{length(df$c)} +setMethod("length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "length", x@jc) + column(jc) + }) + +#' log +#' +#' Computes the natural logarithm of the given value. +#' +#' @rdname log +#' @name log +#' @family math_funcs +#' @export +#' @examples \dontrun{log(df$c)} +setMethod("log", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log", x@jc) + column(jc) + }) + +#' log10 +#' +#' Computes the logarithm of the given value in base 10. +#' +#' @rdname log10 +#' @name log10 +#' @family math_funcs +#' @export +#' @examples \dontrun{log10(df$c)} +setMethod("log10", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log10", x@jc) + column(jc) + }) + +#' log1p +#' +#' Computes the natural logarithm of the given value plus one. +#' +#' @rdname log1p +#' @name log1p +#' @family math_funcs +#' @export +#' @examples \dontrun{log1p(df$c)} +setMethod("log1p", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log1p", x@jc) + column(jc) + }) + +#' log2 +#' +#' Computes the logarithm of the given column in base 2. +#' +#' @rdname log2 +#' @name log2 +#' @family math_funcs +#' @export +#' @examples \dontrun{log2(df$c)} +setMethod("log2", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log2", x@jc) + column(jc) + }) + +#' lower +#' +#' Converts a string column to lower case. +#' +#' @rdname lower +#' @name lower +#' @family string_funcs +#' @export +#' @examples \dontrun{lower(df$c)} +setMethod("lower", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "lower", x@jc) + column(jc) + }) + +#' ltrim +#' +#' Trim the spaces from left end for the specified string value. +#' +#' @rdname ltrim +#' @name ltrim +#' @family string_funcs +#' @export +#' @examples \dontrun{ltrim(df$c)} +setMethod("ltrim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ltrim", x@jc) + column(jc) + }) + +#' max +#' +#' Aggregate function: returns the maximum value of the expression in a group. +#' +#' @rdname max +#' @name max +#' @family agg_funcs +#' @export +#' @examples \dontrun{max(df$c)} +setMethod("max", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "max", x@jc) + column(jc) + }) + +#' md5 +#' +#' Calculates the MD5 digest of a binary column and returns the value +#' as a 32 character hex string. +#' +#' @rdname md5 +#' @name md5 +#' @family misc_funcs +#' @export +#' @examples \dontrun{md5(df$c)} +setMethod("md5", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "md5", x@jc) + column(jc) + }) + +#' mean +#' +#' Aggregate function: returns the average of the values in a group. +#' Alias for avg. +#' +#' @rdname mean +#' @name mean +#' @family agg_funcs +#' @export +#' @examples \dontrun{mean(df$c)} +setMethod("mean", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "mean", x@jc) + column(jc) + }) + +#' min +#' +#' Aggregate function: returns the minimum value of the expression in a group. 
+#' +#' @rdname min +#' @name min +#' @family agg_funcs +#' @export +#' @examples \dontrun{min(df$c)} +setMethod("min", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "min", x@jc) + column(jc) + }) + +#' minute +#' +#' Extracts the minutes as an integer from a given date/timestamp/string. +#' +#' @rdname minute +#' @name minute +#' @family datetime_funcs +#' @export +#' @examples \dontrun{minute(df$c)} +setMethod("minute", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "minute", x@jc) + column(jc) + }) + +#' month +#' +#' Extracts the month as an integer from a given date/timestamp/string. +#' +#' @rdname month +#' @name month +#' @family datetime_funcs +#' @export +#' @examples \dontrun{month(df$c)} +setMethod("month", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "month", x@jc) + column(jc) + }) + +#' negate +#' +#' Unary minus, i.e. negate the expression. +#' +#' @rdname negate +#' @name negate +#' @family normal_funcs +#' @export +#' @examples \dontrun{negate(df$c)} +setMethod("negate", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "negate", x@jc) + column(jc) + }) + +#' quarter +#' +#' Extracts the quarter as an integer from a given date/timestamp/string. +#' +#' @rdname quarter +#' @name quarter +#' @family datetime_funcs +#' @export +#' @examples \dontrun{quarter(df$c)} +setMethod("quarter", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "quarter", x@jc) + column(jc) + }) + +#' reverse +#' +#' Reverses the string column and returns it as a new string column. +#' +#' @rdname reverse +#' @name reverse +#' @family string_funcs +#' @export +#' @examples \dontrun{reverse(df$c)} +setMethod("reverse", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "reverse", x@jc) + column(jc) + }) + +#' rint +#' +#' Returns the double value that is closest in value to the argument and +#' is equal to a mathematical integer. +#' +#' @rdname rint +#' @name rint +#' @family math_funcs +#' @export +#' @examples \dontrun{rint(df$c)} +setMethod("rint", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "rint", x@jc) + column(jc) + }) + +#' round +#' +#' Returns the value of the column `e` rounded to 0 decimal places. +#' +#' @rdname round +#' @name round +#' @family math_funcs +#' @export +#' @examples \dontrun{round(df$c)} +setMethod("round", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "round", x@jc) + column(jc) + }) + +#' rtrim +#' +#' Trim the spaces from right end for the specified string value. +#' +#' @rdname rtrim +#' @name rtrim +#' @family string_funcs +#' @export +#' @examples \dontrun{rtrim(df$c)} +setMethod("rtrim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "rtrim", x@jc) + column(jc) + }) + +#' second +#' +#' Extracts the seconds as an integer from a given date/timestamp/string. 
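# A sketch of the date/time extraction helpers above. The timestamp strings
# are illustrative; per the docs, these functions accept date, timestamp, or
# string columns.
events <- createDataFrame(sqlContext,
                          data.frame(ts = c("2015-08-20 10:30:45",
                                            "2015-11-09 23:05:10"),
                                     stringsAsFactors = FALSE))
head(select(events, month(events$ts), quarter(events$ts),
            minute(events$ts), second(events$ts)))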
+#' +#' @rdname second +#' @name second +#' @family datetime_funcs +#' @export +#' @examples \dontrun{second(df$c)} +setMethod("second", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "second", x@jc) + column(jc) + }) + +#' sha1 +#' +#' Calculates the SHA-1 digest of a binary column and returns the value +#' as a 40 character hex string. +#' +#' @rdname sha1 +#' @name sha1 +#' @family misc_funcs +#' @export +#' @examples \dontrun{sha1(df$c)} +setMethod("sha1", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sha1", x@jc) + column(jc) + }) + +#' signum +#' +#' Computes the signum of the given value. +#' +#' @rdname signum +#' @name signum +#' @family math_funcs +#' @export +#' @examples \dontrun{signum(df$c)} +setMethod("signum", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "signum", x@jc) + column(jc) + }) + +#' sin +#' +#' Computes the sine of the given value. +#' +#' @rdname sin +#' @name sin +#' @family math_funcs +#' @export +#' @examples \dontrun{sin(df$c)} +setMethod("sin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sin", x@jc) + column(jc) + }) + +#' sinh +#' +#' Computes the hyperbolic sine of the given value. +#' +#' @rdname sinh +#' @name sinh +#' @family math_funcs +#' @export +#' @examples \dontrun{sinh(df$c)} +setMethod("sinh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sinh", x@jc) + column(jc) + }) + +#' size +#' +#' Returns length of array or map. +#' +#' @rdname size +#' @name size +#' @family collection_funcs +#' @export +#' @examples \dontrun{size(df$c)} +setMethod("size", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "size", x@jc) + column(jc) + }) + +#' soundex +#' +#' Return the soundex code for the specified expression. +#' +#' @rdname soundex +#' @name soundex +#' @family string_funcs +#' @export +#' @examples \dontrun{soundex(df$c)} +setMethod("soundex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "soundex", x@jc) + column(jc) + }) + +#' sqrt +#' +#' Computes the square root of the specified float value. +#' +#' @rdname sqrt +#' @name sqrt +#' @family math_funcs +#' @export +#' @examples \dontrun{sqrt(df$c)} +setMethod("sqrt", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sqrt", x@jc) + column(jc) + }) + +#' sum +#' +#' Aggregate function: returns the sum of all values in the expression. +#' +#' @rdname sum +#' @name sum +#' @family agg_funcs +#' @export +#' @examples \dontrun{sum(df$c)} +setMethod("sum", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sum", x@jc) + column(jc) + }) + +#' sumDistinct +#' +#' Aggregate function: returns the sum of distinct values in the expression. +#' +#' @rdname sumDistinct +#' @name sumDistinct +#' @family agg_funcs +#' @export +#' @examples \dontrun{sumDistinct(df$c)} +setMethod("sumDistinct", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc) + column(jc) + }) + +#' tan +#' +#' Computes the tangent of the given value. 
+#' +#' @rdname tan +#' @name tan +#' @family math_funcs +#' @export +#' @examples \dontrun{tan(df$c)} +setMethod("tan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "tan", x@jc) + column(jc) + }) + +#' tanh +#' +#' Computes the hyperbolic tangent of the given value. +#' +#' @rdname tanh +#' @name tanh +#' @family math_funcs +#' @export +#' @examples \dontrun{tanh(df$c)} +setMethod("tanh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "tanh", x@jc) + column(jc) + }) + +#' toDegrees +#' +#' Converts an angle measured in radians to an approximately equivalent angle measured in degrees. +#' +#' @rdname toDegrees +#' @name toDegrees +#' @family math_funcs +#' @export +#' @examples \dontrun{toDegrees(df$c)} +setMethod("toDegrees", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "toDegrees", x@jc) + column(jc) + }) + +#' toRadians +#' +#' Converts an angle measured in degrees to an approximately equivalent angle measured in radians. +#' +#' @rdname toRadians +#' @name toRadians +#' @family math_funcs +#' @export +#' @examples \dontrun{toRadians(df$c)} +setMethod("toRadians", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "toRadians", x@jc) + column(jc) + }) + +#' to_date +#' +#' Converts the column into DateType. +#' +#' @rdname to_date +#' @name to_date +#' @family datetime_funcs +#' @export +#' @examples \dontrun{to_date(df$c)} +setMethod("to_date", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "to_date", x@jc) + column(jc) + }) + +#' trim +#' +#' Trim the spaces from both ends for the specified string column. +#' +#' @rdname trim +#' @name trim +#' @family string_funcs +#' @export +#' @examples \dontrun{trim(df$c)} +setMethod("trim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "trim", x@jc) + column(jc) + }) + +#' unbase64 +#' +#' Decodes a BASE64 encoded string column and returns it as a binary column. +#' This is the reverse of base64. +#' +#' @rdname unbase64 +#' @name unbase64 +#' @family string_funcs +#' @export +#' @examples \dontrun{unbase64(df$c)} +setMethod("unbase64", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "unbase64", x@jc) + column(jc) + }) + +#' unhex +#' +#' Inverse of hex. Interprets each pair of characters as a hexadecimal number +#' and converts to the byte representation of number. +#' +#' @rdname unhex +#' @name unhex +#' @family math_funcs +#' @export +#' @examples \dontrun{unhex(df$c)} +setMethod("unhex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "unhex", x@jc) + column(jc) + }) + +#' upper +#' +#' Converts a string column to upper case. +#' +#' @rdname upper +#' @name upper +#' @family string_funcs +#' @export +#' @examples \dontrun{upper(df$c)} +setMethod("upper", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "upper", x@jc) + column(jc) + }) + +#' weekofyear +#' +#' Extracts the week number as an integer from a given date/timestamp/string. 
+#' +#' @rdname weekofyear +#' @name weekofyear +#' @family datetime_funcs +#' @export +#' @examples \dontrun{weekofyear(df$c)} +setMethod("weekofyear", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "weekofyear", x@jc) + column(jc) + }) + +#' year +#' +#' Extracts the year as an integer from a given date/timestamp/string. +#' +#' @rdname year +#' @name year +#' @family datetime_funcs +#' @export +#' @examples \dontrun{year(df$c)} +setMethod("year", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "year", x@jc) + column(jc) + }) + +#' atan2 +#' +#' Returns the angle theta from the conversion of rectangular coordinates (x, y) to +#' polar coordinates (r, theta). +#' +#' @rdname atan2 +#' @name atan2 +#' @family math_funcs +#' @export +#' @examples \dontrun{atan2(df$c, x)} +setMethod("atan2", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "atan2", y@jc, x) + column(jc) + }) + +#' datediff +#' +#' Returns the number of days from `start` to `end`. +#' +#' @rdname datediff +#' @name datediff +#' @family datetime_funcs +#' @export +#' @examples \dontrun{datediff(df$c, x)} +setMethod("datediff", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "datediff", y@jc, x) + column(jc) + }) + +#' hypot +#' +#' Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. +#' +#' @rdname hypot +#' @name hypot +#' @family math_funcs +#' @export +#' @examples \dontrun{hypot(df$c, x)} +setMethod("hypot", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "hypot", y@jc, x) + column(jc) + }) + +#' levenshtein +#' +#' Computes the Levenshtein distance of the two given string columns. +#' +#' @rdname levenshtein +#' @name levenshtein +#' @family string_funcs +#' @export +#' @examples \dontrun{levenshtein(df$c, x)} +setMethod("levenshtein", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "levenshtein", y@jc, x) + column(jc) + }) + +#' months_between +#' +#' Returns the number of months between dates `date1` and `date2`. +#' +#' @rdname months_between +#' @name months_between +#' @family datetime_funcs +#' @export +#' @examples \dontrun{months_between(df$c, x)} +setMethod("months_between", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "months_between", y@jc, x) + column(jc) + }) + +#' nanvl +#' +#' Returns col1 if it is not NaN, or col2 if col1 is NaN. +#' Both inputs should be floating point columns (DoubleType or FloatType). +#' +#' @rdname nanvl +#' @name nanvl +#' @family normal_funcs +#' @export +#' @examples \dontrun{nanvl(df$c, x)} +setMethod("nanvl", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "nanvl", y@jc, x) + column(jc) + }) + +#' pmod +#' +#' Returns the positive value of dividend mod divisor. 
+#' +#' @rdname pmod +#' @name pmod +#' @docType methods +#' @family math_funcs +#' @export +#' @examples \dontrun{pmod(df$c, x)} +setMethod("pmod", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "pmod", y@jc, x) + column(jc) + }) + + +#' Approx Count Distinct +#' +#' @family agg_funcs +#' @rdname approxCountDistinct +#' @name approxCountDistinct +#' @return the approximate number of distinct items in a group. +#' @export +setMethod("approxCountDistinct", + signature(x = "Column"), + function(x, rsd = 0.95) { + jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd) + column(jc) + }) + +#' Count Distinct +#' +#' @family agg_funcs +#' @rdname countDistinct +#' @name countDistinct +#' @return the number of distinct items in a group. +#' @export +setMethod("countDistinct", + signature(x = "Column"), + function(x, ...) { + jcol <- lapply(list(...), function (x) { + x@jc + }) + jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc, + listToSeq(jcol)) + column(jc) + }) + + +#' concat +#' +#' Concatenates multiple input string columns together into a single string column. +#' +#' @family string_funcs +#' @rdname concat +#' @name concat +#' @export +setMethod("concat", + signature(x = "Column"), + function(x, ...) { + jcols <- lapply(list(x, ...), function(x) { x@jc }) + jc <- callJStatic("org.apache.spark.sql.functions", "concat", listToSeq(jcols)) + column(jc) + }) + +#' greatest +#' +#' Returns the greatest value of the list of column names, skipping null values. +#' This function takes at least 2 parameters. It will return null if all parameters are null. +#' +#' @family normal_funcs +#' @rdname greatest +#' @name greatest +#' @export +setMethod("greatest", + signature(x = "Column"), + function(x, ...) { + stopifnot(length(list(...)) > 0) + jcols <- lapply(list(x, ...), function(x) { x@jc }) + jc <- callJStatic("org.apache.spark.sql.functions", "greatest", listToSeq(jcols)) + column(jc) + }) + +#' least +#' +#' Returns the least value of the list of column names, skipping null values. +#' This function takes at least 2 parameters. It will return null iff all parameters are null. +#' +#' @family normal_funcs +#' @rdname least +#' @name least +#' @export +setMethod("least", + signature(x = "Column"), + function(x, ...) { + stopifnot(length(list(...)) > 0) + jcols <- lapply(list(x, ...), function(x) { x@jc }) + jc <- callJStatic("org.apache.spark.sql.functions", "least", listToSeq(jcols)) + column(jc) + }) + +#' ceiling +#' +#' Computes the ceiling of the given value. +#' +#' @family math_funcs +#' @rdname ceil +#' @name ceil +#' @aliases ceil +#' @export +setMethod("ceiling", + signature(x = "Column"), + function(x) { + ceil(x) + }) + +#' sign +#' +#' Computes the signum of the given value. +#' +#' @family math_funcs +#' @rdname signum +#' @name signum +#' @aliases signum +#' @export +setMethod("sign", signature(x = "Column"), + function(x) { + signum(x) + }) + +#' n_distinct +#' +#' Aggregate function: returns the number of distinct items in a group. +#' +#' @family agg_funcs +#' @rdname countDistinct +#' @name countDistinct +#' @aliases countDistinct +#' @export +setMethod("n_distinct", signature(x = "Column"), + function(x, ...) { + countDistinct(x, ...) + }) + +#' n +#' +#' Aggregate function: returns the number of items in a group. 
+#' +#' @family agg_funcs +#' @rdname count +#' @name count +#' @aliases count +#' @export +setMethod("n", signature(x = "Column"), + function(x) { + count(x) + }) + +#' date_format +#' +#' Converts a date/timestamp/string to a value of string in the format specified by the date +#' format given by the second argument. +#' +#' A pattern could be for instance \preformatted{dd.MM.yyyy} and could return a string like '18.03.1993'. All +#' pattern letters of \code{java.text.SimpleDateFormat} can be used. +#' +#' NOTE: Use specialized functions like \code{year} whenever possible. These benefit from a +#' specialized implementation. +#' +#' @family datetime_funcs +#' @rdname date_format +#' @name date_format +#' @export +setMethod("date_format", signature(y = "Column", x = "character"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "date_format", y@jc, x) + column(jc) + }) + +#' from_utc_timestamp +#' +#' Assumes given timestamp is UTC and converts to given timezone. +#' +#' @family datetime_funcs +#' @rdname from_utc_timestamp +#' @name from_utc_timestamp +#' @export +setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x) + column(jc) + }) + +#' instr +#' +#' Locate the position of the first occurrence of substr column in the given string. +#' Returns null if either of the arguments is null. +#' +#' NOTE: The position is not zero based, but a 1 based index; returns 0 if substr +#' could not be found in str. +#' +#' @family string_funcs +#' @rdname instr +#' @name instr +#' @export +setMethod("instr", signature(y = "Column", x = "character"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "instr", y@jc, x) + column(jc) + }) + +#' next_day +#' +#' Given a date column, returns the first date which is later than the value of the date column +#' that is on the specified day of the week. +#' +#' For example, \code{next_day('2015-07-27', "Sunday")} returns 2015-08-02 because that is the first +#' Sunday after 2015-07-27. +#' +#' Day of the week parameter is case insensitive, and accepts: +#' "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". +#' +#' @family datetime_funcs +#' @rdname next_day +#' @name next_day +#' @export +setMethod("next_day", signature(y = "Column", x = "character"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "next_day", y@jc, x) + column(jc) + }) + +#' to_utc_timestamp +#' +#' Assumes given timestamp is in given timezone and converts to UTC. +#' +#' @family datetime_funcs +#' @rdname to_utc_timestamp +#' @name to_utc_timestamp +#' @export +setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x) + column(jc) + }) + +#' add_months +#' +#' Returns the date that is numMonths after startDate. 
+#' +#' @family datetime_funcs +#' @rdname add_months +#' @name add_months +#' @export +setMethod("add_months", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "add_months", y@jc, as.integer(x)) + column(jc) + }) + +#' date_add +#' +#' Returns the date that is `days` days after `start`. +#' +#' @family datetime_funcs +#' @rdname date_add +#' @name date_add +#' @export +setMethod("date_add", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "date_add", y@jc, as.integer(x)) + column(jc) + }) + +#' date_sub +#' +#' Returns the date that is `days` days before `start`. +#' +#' @family datetime_funcs +#' @rdname date_sub +#' @name date_sub +#' @export +setMethod("date_sub", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "date_sub", y@jc, as.integer(x)) + column(jc) + }) + +#' format_number +#' +#' Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places, +#' and returns the result as a string column. +#' +#' If d is 0, the result has no decimal point or fractional part. +#' If d < 0, the result will be null. +#' +#' @family string_funcs +#' @rdname format_number +#' @name format_number +#' @export +setMethod("format_number", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "format_number", + y@jc, as.integer(x)) + column(jc) + }) + +#' sha2 +#' +#' Calculates the SHA-2 family of hash functions of a binary column and +#' returns the value as a hex string. +#' +#' @param y column to compute SHA-2 on. +#' @param x one of 224, 256, 384, or 512. +#' @family misc_funcs +#' @rdname sha2 +#' @name sha2 +#' @export +setMethod("sha2", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sha2", y@jc, as.integer(x)) + column(jc) + }) + +#' shiftLeft +#' +#' Shift the given value numBits left. If the given value is a long value, this function +#' will return a long value else it will return an integer value. +#' +#' @family math_funcs +#' @rdname shiftLeft +#' @name shiftLeft +#' @export +setMethod("shiftLeft", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "shiftLeft", + y@jc, as.integer(x)) + column(jc) + }) + +#' shiftRight +#' +#' Shift the given value numBits right. If the given value is a long value, it will return +#' a long value else it will return an integer value. +#' +#' @family math_funcs +#' @rdname shiftRight +#' @name shiftRight +#' @export +setMethod("shiftRight", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "shiftRight", + y@jc, as.integer(x)) + column(jc) + }) + +#' shiftRightUnsigned +#' +#' Unsigned shift the given value numBits right. If the given value is a long value, +#' it will return a long value else it will return an integer value. +#' +#' @family math_funcs +#' @rdname shiftRightUnsigned +#' @name shiftRightUnsigned +#' @export +setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), + function(y, x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "shiftRightUnsigned", + y@jc, as.integer(x)) + column(jc) + }) + +#' concat_ws +#' +#' Concatenates multiple input string columns together into a single string column, +#' using the given separator. 
+#' +#' @family string_funcs +#' @rdname concat_ws +#' @name concat_ws +#' @export +setMethod("concat_ws", signature(sep = "character", x = "Column"), + function(sep, x, ...) { + jcols <- listToSeq(lapply(list(x, ...), function(x) { x@jc })) + jc <- callJStatic("org.apache.spark.sql.functions", "concat_ws", sep, jcols) + column(jc) + }) + +#' conv +#' +#' Convert a number in a string column from one base to another. +#' +#' @family math_funcs +#' @rdname conv +#' @name conv +#' @export +setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeric"), + function(x, fromBase, toBase) { + fromBase <- as.integer(fromBase) + toBase <- as.integer(toBase) + jc <- callJStatic("org.apache.spark.sql.functions", + "conv", + x@jc, fromBase, toBase) + column(jc) + }) + +#' expr +#' +#' Parses the expression string into the column that it represents, similar to +#' DataFrame.selectExpr. +#' +#' @family normal_funcs +#' @rdname expr +#' @name expr +#' @export +setMethod("expr", signature(x = "character"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "expr", x) + column(jc) + }) + +#' format_string +#' +#' Formats the arguments in printf-style and returns the result as a string column. +#' +#' @family string_funcs +#' @rdname format_string +#' @name format_string +#' @export +setMethod("format_string", signature(format = "character", x = "Column"), + function(format, x, ...) { + jcols <- listToSeq(lapply(list(x, ...), function(arg) { arg@jc })) + jc <- callJStatic("org.apache.spark.sql.functions", + "format_string", + format, jcols) + column(jc) + }) + +#' from_unixtime +#' +#' Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string +#' representing the timestamp of that moment in the current system time zone in the given +#' format. +#' +#' @family datetime_funcs +#' @rdname from_unixtime +#' @name from_unixtime +#' @export +setMethod("from_unixtime", signature(x = "Column"), + function(x, format = "yyyy-MM-dd HH:mm:ss") { + jc <- callJStatic("org.apache.spark.sql.functions", + "from_unixtime", + x@jc, format) + column(jc) + }) + +#' locate +#' +#' Locate the position of the first occurrence of substr. +#' NOTE: The position is not zero based, but a 1 based index; returns 0 if substr +#' could not be found in str. +#' +#' @family string_funcs +#' @rdname locate +#' @name locate +#' @export +setMethod("locate", signature(substr = "character", str = "Column"), + function(substr, str, pos = 0) { + jc <- callJStatic("org.apache.spark.sql.functions", + "locate", + substr, str@jc, as.integer(pos)) + column(jc) + }) + +#' lpad +#' +#' Left-pad the string column with pad to a length of len. +#' +#' @family string_funcs +#' @rdname lpad +#' @name lpad +#' @export +setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), + function(x, len, pad) { + jc <- callJStatic("org.apache.spark.sql.functions", + "lpad", + x@jc, as.integer(len), pad) + column(jc) + }) + +#' rand +#' +#' Generate a random column with i.i.d. samples from U[0.0, 1.0]. +#' +#' @family normal_funcs +#' @rdname rand +#' @name rand +#' @export +setMethod("rand", signature(seed = "missing"), + function(seed) { + jc <- callJStatic("org.apache.spark.sql.functions", "rand") + column(jc) + }) +#' @family normal_funcs +#' @rdname rand +#' @name rand +#' @export +setMethod("rand", signature(seed = "numeric"), + function(seed) { + jc <- callJStatic("org.apache.spark.sql.functions", "rand", as.integer(seed)) + column(jc) + }) + +#' randn +#' +#' Generate a column with i.i.d. 
samples from the standard normal distribution. +#' +#' @family normal_funcs +#' @rdname randn +#' @name randn +#' @export +setMethod("randn", signature(seed = "missing"), + function(seed) { + jc <- callJStatic("org.apache.spark.sql.functions", "randn") + column(jc) + }) +#' @family normal_funcs +#' @rdname randn +#' @name randn +#' @export +setMethod("randn", signature(seed = "numeric"), + function(seed) { + jc <- callJStatic("org.apache.spark.sql.functions", "randn", as.integer(seed)) + column(jc) + }) + +#' regexp_extract +#' +#' Extract a specific (idx) group identified by a Java regex from the specified string column. +#' +#' @family string_funcs +#' @rdname regexp_extract +#' @name regexp_extract +#' @export +setMethod("regexp_extract", + signature(x = "Column", pattern = "character", idx = "numeric"), + function(x, pattern, idx) { + jc <- callJStatic("org.apache.spark.sql.functions", + "regexp_extract", + x@jc, pattern, as.integer(idx)) + column(jc) + }) + +#' regexp_replace +#' +#' Replace all substrings of the specified string value that match regexp with rep. +#' +#' @family string_funcs +#' @rdname regexp_replace +#' @name regexp_replace +#' @export +setMethod("regexp_replace", + signature(x = "Column", pattern = "character", replacement = "character"), + function(x, pattern, replacement) { + jc <- callJStatic("org.apache.spark.sql.functions", + "regexp_replace", + x@jc, pattern, replacement) + column(jc) + }) + +#' rpad +#' +#' Right-pad the string column with pad to a length of len. +#' +#' @family string_funcs +#' @rdname rpad +#' @name rpad +#' @export +setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), + function(x, len, pad) { + jc <- callJStatic("org.apache.spark.sql.functions", + "rpad", + x@jc, as.integer(len), pad) + column(jc) + }) + +#' substring_index +#' +#' Returns the substring from string str before count occurrences of the delimiter delim. +#' If count is positive, everything to the left of the final delimiter (counting from the left) is +#' returned. If count is negative, everything to the right of the final delimiter (counting from the +#' right) is returned. substring_index performs a case-sensitive match when searching for delim. +#' +#' @family string_funcs +#' @rdname substring_index +#' @name substring_index +#' @export +setMethod("substring_index", + signature(x = "Column", delim = "character", count = "numeric"), + function(x, delim, count) { + jc <- callJStatic("org.apache.spark.sql.functions", + "substring_index", + x@jc, delim, as.integer(count)) + column(jc) + }) + +#' translate +#' +#' Translate any character in the src by a character in replaceString. +#' The characters in replaceString correspond to the characters in matchingString. +#' The translation occurs whenever a character in the string matches a character +#' in matchingString. +#' +#' @family string_funcs +#' @rdname translate +#' @name translate +#' @export +setMethod("translate", + signature(x = "Column", matchingString = "character", replaceString = "character"), + function(x, matchingString, replaceString) { + jc <- callJStatic("org.apache.spark.sql.functions", + "translate", x@jc, matchingString, replaceString) + column(jc) + }) + +#' unix_timestamp +#' +#' Gets current Unix timestamp in seconds. 
+#' +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export +setMethod("unix_timestamp", signature(x = "missing", format = "missing"), + function(x, format) { + jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp") + column(jc) + }) +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export +setMethod("unix_timestamp", signature(x = "Column", format = "missing"), + function(x, format) { + jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc) + column(jc) + }) +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export +setMethod("unix_timestamp", signature(x = "Column", format = "character"), + function(x, format = "yyyy-MM-dd HH:mm:ss") { + jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) + column(jc) + }) +#' when +#' +#' Evaluates a list of conditions and returns one of multiple possible result expressions. +#' For unmatched expressions null is returned. +#' +#' @family normal_funcs +#' @rdname when +#' @name when +#' @export +setMethod("when", signature(condition = "Column", value = "ANY"), + function(condition, value) { + condition <- condition@jc + value <- ifelse(class(value) == "Column", value@jc, value) + jc <- callJStatic("org.apache.spark.sql.functions", "when", condition, value) + column(jc) + }) + +#' ifelse +#' +#' Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. +#' Otherwise \code{no} is returned for unmatched conditions. +#' +#' @family normal_funcs +#' @rdname ifelse +#' @name ifelse +#' @export +setMethod("ifelse", + signature(test = "Column", yes = "ANY", no = "ANY"), + function(test, yes, no) { + test <- test@jc + yes <- ifelse(class(yes) == "Column", yes@jc, yes) + no <- ifelse(class(no) == "Column", no@jc, no) + jc <- callJMethod(callJStatic("org.apache.spark.sql.functions", + "when", + test, yes), + "otherwise", no) + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947129e8..43dd8d283ab6 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -413,7 +413,7 @@ setGeneric("dropna", #' @rdname nafunctions #' @export setGeneric("na.omit", - function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { + function(object, ...) { standardGeneric("na.omit") }) @@ -441,7 +441,7 @@ setGeneric("filter", function(x, condition) { standardGeneric("filter") }) #' @export setGeneric("group_by", function(x, ...) { standardGeneric("group_by") }) -#' @rdname DataFrame +#' @rdname groupBy #' @export setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") }) @@ -467,7 +467,7 @@ setGeneric("merge") #' @rdname withColumn #' @export -setGeneric("mutate", function(x, ...) {standardGeneric("mutate") }) +setGeneric("mutate", function(.data, ...) {standardGeneric("mutate") }) #' @rdname arrange #' @export @@ -507,6 +507,10 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { standardGeneric("saveAsTable") }) +#' @rdname withColumn +#' @export +setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) @@ -531,12 +535,16 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") #' @export setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) +# @rdname subset +# @export +setGeneric("subset", function(x, subset, select, ...) 
{ standardGeneric("subset") }) + #' @rdname agg #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson @@ -569,111 +577,405 @@ setGeneric("withColumnRenamed", #' @rdname column #' @export -setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") }) +setGeneric("asc", function(x) { standardGeneric("asc") }) #' @rdname column #' @export -setGeneric("asc", function(x) { standardGeneric("asc") }) +setGeneric("between", function(x, bounds) { standardGeneric("between") }) #' @rdname column #' @export -setGeneric("avg", function(x, ...) { standardGeneric("avg") }) +setGeneric("cast", function(x, dataType) { standardGeneric("cast") }) #' @rdname column #' @export -setGeneric("between", function(x, bounds) { standardGeneric("between") }) +setGeneric("contains", function(x, ...) { standardGeneric("contains") }) #' @rdname column #' @export -setGeneric("cast", function(x, dataType) { standardGeneric("cast") }) +setGeneric("desc", function(x) { standardGeneric("desc") }) #' @rdname column #' @export -setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) +setGeneric("endsWith", function(x, ...) { standardGeneric("endsWith") }) #' @rdname column #' @export -setGeneric("contains", function(x, ...) { standardGeneric("contains") }) +setGeneric("getField", function(x, ...) { standardGeneric("getField") }) + #' @rdname column #' @export -setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") }) +setGeneric("getItem", function(x, ...) { standardGeneric("getItem") }) #' @rdname column #' @export -setGeneric("desc", function(x) { standardGeneric("desc") }) +setGeneric("isNull", function(x) { standardGeneric("isNull") }) #' @rdname column #' @export -setGeneric("endsWith", function(x, ...) { standardGeneric("endsWith") }) +setGeneric("isNotNull", function(x) { standardGeneric("isNotNull") }) #' @rdname column #' @export -setGeneric("getField", function(x, ...) { standardGeneric("getField") }) +setGeneric("like", function(x, ...) { standardGeneric("like") }) #' @rdname column #' @export -setGeneric("getItem", function(x, ...) { standardGeneric("getItem") }) +setGeneric("rlike", function(x, ...) { standardGeneric("rlike") }) #' @rdname column #' @export -setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) +setGeneric("startsWith", function(x, ...) { standardGeneric("startsWith") }) #' @rdname column #' @export -setGeneric("isNull", function(x) { standardGeneric("isNull") }) +setGeneric("when", function(condition, value) { standardGeneric("when") }) #' @rdname column #' @export -setGeneric("isNotNull", function(x) { standardGeneric("isNotNull") }) +setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") }) -#' @rdname column + +###################### Expression Function Methods ########################## + +#' @rdname add_months +#' @export +setGeneric("add_months", function(y, x) { standardGeneric("add_months") }) + +#' @rdname approxCountDistinct +#' @export +setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") }) + +#' @rdname ascii +#' @export +setGeneric("ascii", function(x) { standardGeneric("ascii") }) + +#' @rdname avg +#' @export +setGeneric("avg", function(x, ...) 
{ standardGeneric("avg") }) + +#' @rdname base64 +#' @export +setGeneric("base64", function(x) { standardGeneric("base64") }) + +#' @rdname bin +#' @export +setGeneric("bin", function(x) { standardGeneric("bin") }) + +#' @rdname bitwiseNOT +#' @export +setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) + +#' @rdname cbrt +#' @export +setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) + +#' @rdname ceil +#' @export +setGeneric("ceil", function(x) { standardGeneric("ceil") }) + +#' @rdname concat +#' @export +setGeneric("concat", function(x, ...) { standardGeneric("concat") }) + +#' @rdname concat_ws +#' @export +setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") }) + +#' @rdname conv +#' @export +setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") }) + +#' @rdname countDistinct +#' @export +setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") }) + +#' @rdname crc32 +#' @export +setGeneric("crc32", function(x) { standardGeneric("crc32") }) + +#' @rdname datediff +#' @export +setGeneric("datediff", function(y, x) { standardGeneric("datediff") }) + +#' @rdname date_add +#' @export +setGeneric("date_add", function(y, x) { standardGeneric("date_add") }) + +#' @rdname date_format +#' @export +setGeneric("date_format", function(y, x) { standardGeneric("date_format") }) + +#' @rdname date_sub +#' @export +setGeneric("date_sub", function(y, x) { standardGeneric("date_sub") }) + +#' @rdname dayofmonth +#' @export +setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") }) + +#' @rdname dayofyear +#' @export +setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") }) + +#' @rdname explode +#' @export +setGeneric("explode", function(x) { standardGeneric("explode") }) + +#' @rdname expr +#' @export +setGeneric("expr", function(x) { standardGeneric("expr") }) + +#' @rdname from_utc_timestamp +#' @export +setGeneric("from_utc_timestamp", function(y, x) { standardGeneric("from_utc_timestamp") }) + +#' @rdname format_number +#' @export +setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) + +#' @rdname format_string +#' @export +setGeneric("format_string", function(format, x, ...) { standardGeneric("format_string") }) + +#' @rdname from_unixtime +#' @export +setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") }) + +#' @rdname greatest +#' @export +setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) + +#' @rdname hex +#' @export +setGeneric("hex", function(x) { standardGeneric("hex") }) + +#' @rdname hour +#' @export +setGeneric("hour", function(x) { standardGeneric("hour") }) + +#' @rdname hypot +#' @export +setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) + +#' @rdname initcap +#' @export +setGeneric("initcap", function(x) { standardGeneric("initcap") }) + +#' @rdname instr +#' @export +setGeneric("instr", function(y, x) { standardGeneric("instr") }) + +#' @rdname isNaN +#' @export +setGeneric("isNaN", function(x) { standardGeneric("isNaN") }) + +#' @rdname last #' @export setGeneric("last", function(x) { standardGeneric("last") }) -#' @rdname column +#' @rdname last_day #' @export -setGeneric("like", function(x, ...) { standardGeneric("like") }) +setGeneric("last_day", function(x) { standardGeneric("last_day") }) -#' @rdname column +#' @rdname least +#' @export +setGeneric("least", function(x, ...) 
{ standardGeneric("least") }) + +#' @rdname levenshtein +#' @export +setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") }) + +#' @rdname lit +#' @export +setGeneric("lit", function(x) { standardGeneric("lit") }) + +#' @rdname locate +#' @export +setGeneric("locate", function(substr, str, ...) { standardGeneric("locate") }) + +#' @rdname lower #' @export setGeneric("lower", function(x) { standardGeneric("lower") }) -#' @rdname column +#' @rdname lpad +#' @export +setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") }) + +#' @rdname ltrim +#' @export +setGeneric("ltrim", function(x) { standardGeneric("ltrim") }) + +#' @rdname md5 +#' @export +setGeneric("md5", function(x) { standardGeneric("md5") }) + +#' @rdname minute +#' @export +setGeneric("minute", function(x) { standardGeneric("minute") }) + +#' @rdname month +#' @export +setGeneric("month", function(x) { standardGeneric("month") }) + +#' @rdname months_between +#' @export +setGeneric("months_between", function(y, x) { standardGeneric("months_between") }) + +#' @rdname count #' @export setGeneric("n", function(x) { standardGeneric("n") }) -#' @rdname column +#' @rdname nanvl +#' @export +setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) + +#' @rdname negate +#' @export +setGeneric("negate", function(x) { standardGeneric("negate") }) + +#' @rdname next_day +#' @export +setGeneric("next_day", function(y, x) { standardGeneric("next_day") }) + +#' @rdname countDistinct #' @export setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) -#' @rdname column +#' @rdname pmod +#' @export +setGeneric("pmod", function(y, x) { standardGeneric("pmod") }) + +#' @rdname quarter +#' @export +setGeneric("quarter", function(x) { standardGeneric("quarter") }) + +#' @rdname rand +#' @export +setGeneric("rand", function(seed) { standardGeneric("rand") }) + +#' @rdname randn +#' @export +setGeneric("randn", function(seed) { standardGeneric("randn") }) + +#' @rdname regexp_extract +#' @export +setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") }) + +#' @rdname regexp_replace +#' @export +setGeneric("regexp_replace", + function(x, pattern, replacement) { standardGeneric("regexp_replace") }) + +#' @rdname reverse +#' @export +setGeneric("reverse", function(x) { standardGeneric("reverse") }) + +#' @rdname rint #' @export setGeneric("rint", function(x, ...) { standardGeneric("rint") }) -#' @rdname column +#' @rdname rpad #' @export -setGeneric("rlike", function(x, ...) { standardGeneric("rlike") }) +setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) -#' @rdname column +#' @rdname rtrim #' @export -setGeneric("startsWith", function(x, ...) 
{ standardGeneric("startsWith") }) +setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) -#' @rdname column +#' @rdname second +#' @export +setGeneric("second", function(x) { standardGeneric("second") }) + +#' @rdname sha1 +#' @export +setGeneric("sha1", function(x) { standardGeneric("sha1") }) + +#' @rdname sha2 +#' @export +setGeneric("sha2", function(y, x) { standardGeneric("sha2") }) + +#' @rdname shiftLeft +#' @export +setGeneric("shiftLeft", function(y, x) { standardGeneric("shiftLeft") }) + +#' @rdname shiftRight +#' @export +setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") }) + +#' @rdname shiftRightUnsigned +#' @export +setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") }) + +#' @rdname signum +#' @export +setGeneric("signum", function(x) { standardGeneric("signum") }) + +#' @rdname size +#' @export +setGeneric("size", function(x) { standardGeneric("size") }) + +#' @rdname soundex +#' @export +setGeneric("soundex", function(x) { standardGeneric("soundex") }) + +#' @rdname substring_index +#' @export +setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) + +#' @rdname sumDistinct #' @export setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") }) -#' @rdname column +#' @rdname toDegrees #' @export setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") }) -#' @rdname column +#' @rdname toRadians #' @export setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) -#' @rdname column +#' @rdname to_date +#' @export +setGeneric("to_date", function(x) { standardGeneric("to_date") }) + +#' @rdname to_utc_timestamp +#' @export +setGeneric("to_utc_timestamp", function(y, x) { standardGeneric("to_utc_timestamp") }) + +#' @rdname translate +#' @export +setGeneric("translate", function(x, matchingString, replaceString) { standardGeneric("translate") }) + +#' @rdname trim +#' @export +setGeneric("trim", function(x) { standardGeneric("trim") }) + +#' @rdname unbase64 +#' @export +setGeneric("unbase64", function(x) { standardGeneric("unbase64") }) + +#' @rdname unhex +#' @export +setGeneric("unhex", function(x) { standardGeneric("unhex") }) + +#' @rdname unix_timestamp +#' @export +setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timestamp") }) + +#' @rdname upper #' @export setGeneric("upper", function(x) { standardGeneric("upper") }) +#' @rdname weekofyear +#' @export +setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) + +#' @rdname year +#' @export +setGeneric("year", function(x) { standardGeneric("year") }) + + #' @rdname glm #' @export setGeneric("glm") diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1fd8749..cea3d760d05f 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). 
#' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. -#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index e83104f11642..3c57a44db257 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -160,7 +160,7 @@ sparkR.init <- function( }) if (nchar(sparkHome) != 0) { - sparkHome <- normalizePath(sparkHome) + sparkHome <- suppressWarnings(normalizePath(sparkHome)) } sparkEnvirMap <- new.env() diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 4f9f4d9cad2a..3babcb519378 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -314,7 +314,8 @@ convertEnvsToList <- function(keys, vals) { # Utility function to capture the varargs into environment object varargsToEnv <- function(...) { - pairs <- as.list(substitute(list(...)))[-1L] + # Based on http://stackoverflow.com/a/3057419/4577954 + pairs <- list(...) env <- new.env() for (name in names(pairs)) { env[[name]] <- pairs[[name]] diff --git a/R/pkg/inst/tests/packageInAJarTest.R b/R/pkg/inst/tests/packageInAJarTest.R new file mode 100644 index 000000000000..207a37a0cb47 --- /dev/null +++ b/R/pkg/inst/tests/packageInAJarTest.R @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +library(SparkR) +library(sparkPackageTest) + +sc <- sparkR.init() + +run1 <- myfunc(5L) + +run2 <- myfunc(-4L) + +sparkR.stop() + +if(run1 != 6) quit(save = "no", status = 1) + +if(run2 != -3) quit(save = "no", status = 1) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 7377fc8f1ca9..2062bc72b17b 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -408,6 +408,14 @@ test_that("collect() returns a data.frame", { expect_equal(names(rdf)[1], "age") expect_equal(nrow(rdf), 3) expect_equal(ncol(rdf), 2) + + # collect() returns data correctly from a DataFrame with 0 row + df0 <- limit(df, 0) + rdf <- collect(df0) + expect_true(is.data.frame(rdf)) + expect_equal(names(rdf)[1], "age") + expect_equal(nrow(rdf), 0) + expect_equal(ncol(rdf), 2) }) test_that("limit() returns DataFrame with the correct number of rows", { @@ -492,6 +500,18 @@ test_that("head() and first() return the correct data", { testFirst <- first(df) expect_equal(nrow(testFirst), 1) + + # head() and first() return the correct data on + # a DataFrame with 0 row + df0 <- limit(df, 0) + + testHead <- head(df0) + expect_equal(nrow(testHead), 0) + expect_equal(ncol(testHead), 2) + + testFirst <- first(df0) + expect_equal(nrow(testFirst), 0) + expect_equal(ncol(testFirst), 2) }) test_that("distinct() and unique on DataFrames", { @@ -560,6 +580,42 @@ test_that("select with column", { df2 <- select(df, df$age) expect_equal(columns(df2), c("age")) expect_equal(count(df2), 3) + + df3 <- select(df, lit("x")) + expect_equal(columns(df3), c("x")) + expect_equal(count(df3), 3) + expect_equal(collect(select(df3, "x"))[[1, 1]], "x") +}) + +test_that("subsetting", { + # jsonFile returns columns in random order + df <- select(jsonFile(sqlContext, jsonPath), "name", "age") + filtered <- df[df$age > 20,] + expect_equal(count(filtered), 1) + expect_equal(columns(filtered), c("name", "age")) + expect_equal(collect(filtered)$name, "Andy") + + df2 <- df[df$age == 19, 1] + expect_is(df2, "DataFrame") + expect_equal(count(df2), 1) + expect_equal(columns(df2), c("name")) + expect_equal(collect(df2)$name, "Justin") + + df3 <- df[df$age > 20, 2] + expect_equal(count(df3), 1) + expect_equal(columns(df3), c("age")) + + df4 <- df[df$age %in% c(19, 30), 1:2] + expect_equal(count(df4), 2) + expect_equal(columns(df4), c("name", "age")) + + df5 <- df[df$age %in% c(19), c(1,2)] + expect_equal(count(df5), 1) + expect_equal(columns(df5), c("name", "age")) + + df6 <- subset(df, df$age %in% c(30), c(1,2)) + expect_equal(count(df6), 1) + expect_equal(columns(df6), c("name", "age")) }) test_that("selectExpr() on a DataFrame", { @@ -573,6 +629,11 @@ test_that("selectExpr() on a DataFrame", { expect_equal(count(selected2), 3) }) +test_that("expr() on a DataFrame", { + df <- jsonFile(sqlContext, jsonPath) + expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123) +}) + test_that("column calculation", { df <- jsonFile(sqlContext, jsonPath) d <- collect(select(df, alias(df$age + 1, "age2"))) @@ -640,15 +701,17 @@ test_that("column operators", { test_that("column functions", { c <- SparkR:::col("a") - c2 <- min(c) + max(c) + sum(c) + avg(c) + count(c) + abs(c) + sqrt(c) - c3 <- lower(c) + upper(c) + first(c) + last(c) - c4 <- approxCountDistinct(c) + countDistinct(c) + cast(c, "string") - c5 <- n(c) + n_distinct(c) - c5 <- acos(c) + asin(c) + atan(c) + cbrt(c) - c6 <- ceiling(c) + cos(c) + cosh(c) + exp(c) + expm1(c) - c7 <- floor(c) + log(c) + log10(c) + log1p(c) + rint(c) - c8 <- sign(c) + 
sin(c) + sinh(c) + tan(c) + tanh(c) - c9 <- toDegrees(c) + toRadians(c) + c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c) + c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c) + c3 <- cosh(c) + count(c) + crc32(c) + exp(c) + c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c) + c5 <- hour(c) + initcap(c) + isNaN(c) + last(c) + last_day(c) + length(c) + c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c) + c7 <- mean(c) + min(c) + month(c) + negate(c) + quarter(c) + c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + c9 <- signum(c) + sin(c) + sinh(c) + size(c) + soundex(c) + sqrt(c) + sum(c) + c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c) + c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) df <- jsonFile(sqlContext, jsonPath) df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20))) @@ -661,8 +724,11 @@ test_that("column functions", { expect_equal(collect(df3)[[1, 1]], TRUE) expect_equal(collect(df3)[[2, 1]], FALSE) expect_equal(collect(df3)[[3, 1]], TRUE) -}) + df4 <- createDataFrame(sqlContext, list(list(a = "010101"))) + expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15") +}) +# test_that("column binary mathfunctions", { lines <- c("{\"a\":1, \"b\":5}", "{\"a\":2, \"b\":6}", @@ -681,6 +747,13 @@ test_that("column binary mathfunctions", { expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2)) expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2)) ## nolint end + expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16) + expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4) + expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4) + expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric") + expect_equal(collect(select(df, rand(1)))[1, 1], 0.45, tolerance = 0.01) + expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric") + expect_equal(collect(select(df, randn(1)))[1, 1], -0.0111, tolerance = 0.01) }) test_that("string operators", { @@ -689,6 +762,94 @@ test_that("string operators", { expect_equal(count(where(df, startsWith(df$name, "A"))), 1) expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi") expect_equal(collect(select(df, cast(df$age, "string")))[[2, 1]], "30") + expect_equal(collect(select(df, concat(df$name, lit(":"), df$age)))[[2, 1]], "Andy:30") + expect_equal(collect(select(df, concat_ws(":", df$name)))[[2, 1]], "Andy") + expect_equal(collect(select(df, concat_ws(":", df$name, df$age)))[[2, 1]], "Andy:30") + expect_equal(collect(select(df, instr(df$name, "i")))[, 1], c(2, 0, 5)) + expect_equal(collect(select(df, format_number(df$age, 2)))[2, 1], "30.00") + expect_equal(collect(select(df, sha1(df$name)))[2, 1], + "ab5a000e88b5d9d0fa2575f5c6263eb93452405d") + expect_equal(collect(select(df, sha2(df$name, 256)))[2, 1], + "80f2aed3c618c423ddf05a2891229fba44942d907173152442cf6591441ed6dc") + expect_equal(collect(select(df, format_string("Name:%s", df$name)))[2, 1], "Name:Andy") + expect_equal(collect(select(df, format_string("%s, %d", df$name, df$age)))[2, 1], "Andy, 30") + expect_equal(collect(select(df, regexp_extract(df$name, "(n.y)", 1)))[2, 1], "ndy") + expect_equal(collect(select(df, regexp_replace(df$name, "(n.y)", "ydn")))[2, 1], "Aydn") + + l2 <- list(list(a = "aaads")) + df2 <- createDataFrame(sqlContext, l2) + expect_equal(collect(select(df2, locate("aa", 
df2$a)))[1, 1], 1) + expect_equal(collect(select(df2, locate("aa", df2$a, 1)))[1, 1], 2) + expect_equal(collect(select(df2, lpad(df2$a, 8, "#")))[1, 1], "###aaads") + expect_equal(collect(select(df2, rpad(df2$a, 8, "#")))[1, 1], "aaads###") + + l3 <- list(list(a = "a.b.c.d")) + df3 <- createDataFrame(sqlContext, l3) + expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b") + expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d") + expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d") +}) + +test_that("date functions on a DataFrame", { + .originalTimeZone <- Sys.getenv("TZ") + Sys.setenv(TZ = "UTC") + l <- list(list(a = 1L, b = as.Date("2012-12-13")), + list(a = 2L, b = as.Date("2013-12-14")), + list(a = 3L, b = as.Date("2014-12-15"))) + df <- createDataFrame(sqlContext, l) + expect_equal(collect(select(df, dayofmonth(df$b)))[, 1], c(13, 14, 15)) + expect_equal(collect(select(df, dayofyear(df$b)))[, 1], c(348, 348, 349)) + expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51)) + expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014)) + expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12)) + expect_equal(collect(select(df, last_day(df$b)))[, 1], + c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31"))) + expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1], + c(as.Date("2012-12-17"), as.Date("2013-12-16"), as.Date("2014-12-22"))) + expect_equal(collect(select(df, date_format(df$b, "y")))[, 1], c("2012", "2013", "2014")) + expect_equal(collect(select(df, add_months(df$b, 3)))[, 1], + c(as.Date("2013-03-13"), as.Date("2014-03-14"), as.Date("2015-03-15"))) + expect_equal(collect(select(df, date_add(df$b, 1)))[, 1], + c(as.Date("2012-12-14"), as.Date("2013-12-15"), as.Date("2014-12-16"))) + expect_equal(collect(select(df, date_sub(df$b, 1)))[, 1], + c(as.Date("2012-12-12"), as.Date("2013-12-13"), as.Date("2014-12-14"))) + + l2 <- list(list(a = 1L, b = as.POSIXlt("2012-12-13 12:34:00", tz = "UTC")), + list(a = 2L, b = as.POSIXlt("2014-12-15 01:24:34", tz = "UTC"))) + df2 <- createDataFrame(sqlContext, l2) + expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24)) + expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34)) + expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1], + c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC"))) + expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1], + c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC"))) + expect_more_than(collect(select(df2, unix_timestamp()))[1, 1], 0) + expect_more_than(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0) + expect_more_than(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0) + + l3 <- list(list(a = 1000), list(a = -1000)) + df3 <- createDataFrame(sqlContext, l3) + result31 <- collect(select(df3, from_unixtime(df3$a))) + expect_equal(grep("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", result31[, 1], perl = TRUE), + c(1, 2)) + result32 <- collect(select(df3, from_unixtime(df3$a, "yyyy"))) + expect_equal(grep("\\d{4}", result32[, 1]), c(1, 2)) + Sys.setenv(TZ = .originalTimeZone) +}) + +test_that("greatest() and least() on a DataFrame", { + l <- list(list(a = 1, b = 2), list(a = 3, b = 4)) + df <- createDataFrame(sqlContext, l) + expect_equal(collect(select(df, greatest(df$a, df$b)))[, 1], c(2, 4)) + expect_equal(collect(select(df, least(df$a, 
df$b)))[, 1], c(1, 3)) +}) + +test_that("when(), otherwise() and ifelse() on a DataFrame", { + l <- list(list(a = 1, b = 2), list(a = 3, b = 4)) + df <- createDataFrame(sqlContext, l) + expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, 1)))[, 1], c(NA, 1)) + expect_equal(collect(select(df, otherwise(when(df$a > 1, 1), 0)))[, 1], c(0, 1)) + expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0)) }) test_that("group by", { @@ -871,7 +1032,7 @@ test_that("withColumn() and withColumnRenamed()", { expect_equal(columns(newDF2)[1], "newerAge") }) -test_that("mutate(), rename() and names()", { +test_that("mutate(), transform(), rename() and names()", { df <- jsonFile(sqlContext, jsonPath) newDF <- mutate(df, newAge = df$age + 2) expect_equal(length(columns(newDF)), 3) @@ -885,6 +1046,20 @@ test_that("mutate(), rename() and names()", { names(newDF2) <- c("newerName", "evenNewerAge") expect_equal(length(names(newDF2)), 2) expect_equal(names(newDF2)[1], "newerName") + + transformedDF <- transform(df, newAge = -df$age, newAge2 = df$age / 2) + expect_equal(length(columns(transformedDF)), 4) + expect_equal(columns(transformedDF)[3], "newAge") + expect_equal(columns(transformedDF)[4], "newAge2") + expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30) + + # test if transform on local data frames works + # ensure the proper signature is used - otherwise this will fail to run + attach(airquality) + result <- transform(Ozone, logOzone = log(Ozone)) + expect_equal(nrow(result), 153) + expect_equal(ncol(result), 2) + detach(airquality) }) test_that("write.df() on DataFrame and works with parquetFile", { @@ -903,6 +1078,12 @@ test_that("parquetFile works with multiple input paths", { parquetDF <- parquetFile(sqlContext, parquetPath, parquetPath2) expect_is(parquetDF, "DataFrame") expect_equal(count(parquetDF), count(df) * 2) + + # Test if varargs works with variables + saveMode <- "overwrite" + mergeSchema <- "true" + parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet") + write.df(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = mergeSchema) }) test_that("describe() and summarize() on a DataFrame", { @@ -920,7 +1101,7 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats2)[5, "age"], "30") }) -test_that("dropna() on a DataFrame", { +test_that("dropna() and na.omit() on a DataFrame", { df <- jsonFile(sqlContext, jsonPathNa) rows <- collect(df) @@ -929,6 +1110,8 @@ test_that("dropna() on a DataFrame", { expected <- rows[!is.na(rows$name),] actual <- collect(dropna(df, cols = "name")) expect_identical(expected, actual) + actual <- collect(na.omit(df, cols = "name")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age),] actual <- collect(dropna(df, cols = "age")) @@ -938,48 +1121,67 @@ test_that("dropna() on a DataFrame", { expect_identical(expected$age, actual$age) expect_identical(expected$height, actual$height) expect_identical(expected$name, actual$name) + actual <- collect(na.omit(df, cols = "age")) expected <- rows[!is.na(rows$age) & !is.na(rows$height),] actual <- collect(dropna(df, cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df)) expect_identical(expected, actual) + actual <- collect(na.omit(df)) + expect_identical(expected, actual) # drop with 
how expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df)) expect_identical(expected, actual) + actual <- collect(na.omit(df)) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) | !is.na(rows$height) | !is.na(rows$name),] actual <- collect(dropna(df, "all")) expect_identical(expected, actual) + actual <- collect(na.omit(df, "all")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df, "any")) expect_identical(expected, actual) + actual <- collect(na.omit(df, "any")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height),] actual <- collect(dropna(df, "any", cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, "any", cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) | !is.na(rows$height),] actual <- collect(dropna(df, "all", cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, "all", cols = c("age", "height"))) + expect_identical(expected, actual) # drop with threshold expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) >= 2,] actual <- collect(dropna(df, minNonNulls = 2, cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, minNonNulls = 2, cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) + as.integer(!is.na(rows$name)) >= 3,] actual <- collect(dropna(df, minNonNulls = 3, cols = c("name", "age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, minNonNulls = 3, cols = c("name", "age", "height"))) + expect_identical(expected, actual) }) test_that("fillna() on a DataFrame", { diff --git a/R/run-tests.sh b/R/run-tests.sh index 18a1e13bdc65..e82ad0ba2cd0 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -23,7 +23,7 @@ FAILED=0 LOGFILE=$FWDIR/unit-tests.out rm -f $LOGFILE -SPARK_TESTING=1 $FWDIR/../bin/sparkR --conf spark.buffer.pageSize=4m --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE +SPARK_TESTING=1 $FWDIR/../bin/sparkR --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE FAILED=$((PIPESTATUS[0]||$FAILED)) if [[ $FAILED != 0 ]]; then diff --git a/README.md b/README.md index 380422ca00db..76e29b423566 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Apache Spark Spark is a fast and general cluster computing system for Big Data. It provides -high-level APIs in Scala, Java, and Python, and an optimized engine that +high-level APIs in Scala, Java, Python, and R, and an optimized engine that supports general computation graphs for data analysis. It also supports a rich set of higher-level tools including Spark SQL for SQL and DataFrames, MLlib for machine learning, GraphX for graph processing, @@ -94,5 +94,5 @@ distribution. ## Configuration -Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) +Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. 
diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26ccddc..7b41ebb283ca 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e595a9..16bf17c13541 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/build/mvn b/build/mvn index f62f61ee1c41..ec0380afad31 100755 --- a/build/mvn +++ b/build/mvn @@ -51,11 +51,11 @@ install_app() { # check if we have curl installed # download application [ ! -f "${local_tarball}" ] && [ $(command -v curl) ] && \ - echo "exec: curl ${curl_opts} ${remote_tarball}" && \ + echo "exec: curl ${curl_opts} ${remote_tarball}" 1>&2 && \ curl ${curl_opts} "${remote_tarball}" > "${local_tarball}" # if the file still doesn't exist, lets try `wget` and cross our fingers [ ! -f "${local_tarball}" ] && [ $(command -v wget) ] && \ - echo "exec: wget ${wget_opts} ${remote_tarball}" && \ + echo "exec: wget ${wget_opts} ${remote_tarball}" 1>&2 && \ wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}" # if both were unsuccessful, exit [ ! -f "${local_tarball}" ] && \ @@ -82,7 +82,7 @@ install_mvn() { # Install zinc under the build/ folder install_zinc() { local zinc_path="zinc-0.3.5.3/bin/zinc" - [ ! -f "${zinc_path}" ] && ZINC_INSTALL_FLAG=1 + [ ! -f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1 install_app \ "http://downloads.typesafe.com/zinc/0.3.5.3" \ "zinc-0.3.5.3.tgz" \ @@ -135,9 +135,9 @@ cd "${_CALLING_DIR}" # Now that zinc is ensured to be installed, check its status and, if its # not running or just installed, start it -if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then +if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status -port ${ZINC_PORT}`" ]; then export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"} - ${ZINC_BIN} -shutdown + ${ZINC_BIN} -shutdown -port ${ZINC_PORT} ${ZINC_BIN} -start -port ${ZINC_PORT} \ -scala-compiler "${SCALA_COMPILER}" \ -scala-library "${SCALA_LIBRARY}" &>/dev/null @@ -146,7 +146,7 @@ fi # Set any `mvn` options if not already present export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} -echo "Using \`mvn\` from path: $MVN_BIN" +echo "Using \`mvn\` from path: $MVN_BIN" 1>&2 # Last, call the `mvn` command as usual -${MVN_BIN} "$@" +${MVN_BIN} -DzincPort=${ZINC_PORT} "$@" diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index 7930a38b9674..615f84839465 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -38,8 +38,7 @@ dlog () { acquire_sbt_jar () { SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties` - URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar - URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar + URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar JAR=build/sbt-launch-${SBT_VERSION}.jar sbt_jar=$JAR @@ -51,12 +50,10 @@ acquire_sbt_jar () { printf "Attempting to fetch sbt\n" JAR_DL="${JAR}.part" if [ $(command -v curl) ]; then - (curl --fail --location --silent ${URL1} > "${JAR_DL}" ||\ - (rm -f "${JAR_DL}" && curl --fail --location --silent ${URL2} > "${JAR_DL}")) &&\ + curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\ mv "${JAR_DL}" "${JAR}" elif [ $(command -v wget) ]; then - (wget --quiet 
${URL1} -O "${JAR_DL}" ||\ - (rm -f "${JAR_DL}" && wget --quiet ${URL2} -O "${JAR_DL}")) &&\ + wget --quiet ${URL1} -O "${JAR_DL}" &&\ mv "${JAR_DL}" "${JAR}" else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" diff --git a/conf/log4j.properties.template b/conf/log4j.properties.template index 27006e45e932..74c5cea94403 100644 --- a/conf/log4j.properties.template +++ b/conf/log4j.properties.template @@ -10,6 +10,8 @@ log4j.logger.org.spark-project.jetty=WARN log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index 192d3ae09113..c05fe381a36a 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -38,6 +38,7 @@ # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node # - SPARK_WORKER_DIR, to set the working directory of worker processes # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") diff --git a/core/pom.xml b/core/pom.xml index 202678779150..beb547fa39bd 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml @@ -46,30 +46,10 @@ com.twitter chill_${scala.binary.version} - - - org.ow2.asm - asm - - - org.ow2.asm - asm-commons - - com.twitter chill-java - - - org.ow2.asm - asm - - - org.ow2.asm - asm-commons - - org.apache.hadoop @@ -286,7 +266,7 @@ org.tachyonproject tachyon-client - 0.7.0 + 0.7.1 org.apache.hadoop diff --git a/core/src/main/java/org/apache/spark/api/java/JavaSparkContextVarargsWorkaround.java b/core/src/main/java/org/apache/spark/api/java/JavaSparkContextVarargsWorkaround.java index 2090efd3b999..d4c42b38ac22 100644 --- a/core/src/main/java/org/apache/spark/api/java/JavaSparkContextVarargsWorkaround.java +++ b/core/src/main/java/org/apache/spark/api/java/JavaSparkContextVarargsWorkaround.java @@ -23,11 +23,13 @@ // See // http://scala-programming-language.1934581.n4.nabble.com/Workaround-for-implementing-java-varargs-in-2-7-2-final-tp1944767p1944772.html abstract class JavaSparkContextVarargsWorkaround { - public JavaRDD union(JavaRDD... rdds) { + + @SafeVarargs + public final JavaRDD union(JavaRDD... rdds) { if (rdds.length == 0) { throw new IllegalArgumentException("Union called on empty list"); } - ArrayList> rest = new ArrayList>(rdds.length - 1); + List> rest = new ArrayList<>(rdds.length - 1); for (int i = 1; i < rdds.length; i++) { rest.add(rdds[i]); } @@ -38,18 +40,19 @@ public JavaDoubleRDD union(JavaDoubleRDD... 
rdds) { if (rdds.length == 0) { throw new IllegalArgumentException("Union called on empty list"); } - ArrayList rest = new ArrayList(rdds.length - 1); + List rest = new ArrayList<>(rdds.length - 1); for (int i = 1; i < rdds.length; i++) { rest.add(rdds[i]); } return union(rdds[0], rest); } - public JavaPairRDD union(JavaPairRDD... rdds) { + @SafeVarargs + public final JavaPairRDD union(JavaPairRDD... rdds) { if (rdds.length == 0) { throw new IllegalArgumentException("Union called on empty list"); } - ArrayList> rest = new ArrayList>(rdds.length - 1); + List> rest = new ArrayList<>(rdds.length - 1); for (int i = 1; i < rdds.length; i++) { rest.add(rdds[i]); } @@ -57,7 +60,7 @@ public JavaPairRDD union(JavaPairRDD... rdds) { } // These methods take separate "first" and "rest" elements to avoid having the same type erasure - abstract public JavaRDD union(JavaRDD first, List> rest); - abstract public JavaDoubleRDD union(JavaDoubleRDD first, List rest); - abstract public JavaPairRDD union(JavaPairRDD first, List> rest); + public abstract JavaRDD union(JavaRDD first, List> rest); + public abstract JavaDoubleRDD union(JavaDoubleRDD first, List rest); + public abstract JavaPairRDD union(JavaPairRDD first, List> rest); } diff --git a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java index 0399abc63c23..0e58bb4f7101 100644 --- a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java +++ b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java @@ -25,7 +25,7 @@ import scala.reflect.ClassTag; import org.apache.spark.annotation.Private; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter. 
@@ -49,7 +49,7 @@ public void flush() { try { s.flush(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } @@ -64,7 +64,7 @@ public void close() { try { s.close(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } }; diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java index 1aa6ba420126..e73ba3946882 100644 --- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java @@ -17,6 +17,7 @@ package org.apache.spark.shuffle.unsafe; +import javax.annotation.Nullable; import java.io.File; import java.io.IOException; import java.util.LinkedList; @@ -33,8 +34,11 @@ import org.apache.spark.serializer.DummySerializerInstance; import org.apache.spark.serializer.SerializerInstance; import org.apache.spark.shuffle.ShuffleMemoryManager; -import org.apache.spark.storage.*; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.storage.BlockManager; +import org.apache.spark.storage.DiskBlockObjectWriter; +import org.apache.spark.storage.TempShuffleBlockId; +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.memory.TaskMemoryManager; import org.apache.spark.util.Utils; @@ -67,7 +71,7 @@ final class UnsafeShuffleExternalSorter { private final int pageSizeBytes; @VisibleForTesting final int maxRecordSizeBytes; - private final TaskMemoryManager memoryManager; + private final TaskMemoryManager taskMemoryManager; private final ShuffleMemoryManager shuffleMemoryManager; private final BlockManager blockManager; private final TaskContext taskContext; @@ -86,9 +90,12 @@ final class UnsafeShuffleExternalSorter { private final LinkedList spills = new LinkedList(); + /** Peak memory used by this sorter so far, in bytes. **/ + private long peakMemoryUsedBytes; + // These variables are reset after spilling: - private UnsafeShuffleInMemorySorter sorter; - private MemoryBlock currentPage = null; + @Nullable private UnsafeShuffleInMemorySorter inMemSorter; + @Nullable private MemoryBlock currentPage = null; private long currentPagePosition = -1; private long freeSpaceInCurrentPage = 0; @@ -101,20 +108,24 @@ public UnsafeShuffleExternalSorter( int numPartitions, SparkConf conf, ShuffleWriteMetrics writeMetrics) throws IOException { - this.memoryManager = memoryManager; + this.taskMemoryManager = memoryManager; this.shuffleMemoryManager = shuffleMemoryManager; this.blockManager = blockManager; this.taskContext = taskContext; this.initialSize = initialSize; + this.peakMemoryUsedBytes = initialSize; this.numPartitions = numPartitions; // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; this.pageSizeBytes = (int) Math.min( - PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES, - conf.getSizeAsBytes("spark.buffer.pageSize", "64m")); + PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES, shuffleMemoryManager.pageSizeBytes()); this.maxRecordSizeBytes = pageSizeBytes - 4; this.writeMetrics = writeMetrics; initializeForWriting(); + + // preserve first page to ensure that we have at least one page to work with. 
Otherwise, + // other operators in the same task may starve this sorter (SPARK-9709). + acquireNewPageIfNecessary(pageSizeBytes); } /** @@ -129,7 +140,7 @@ private void initializeForWriting() throws IOException { throw new IOException("Could not acquire " + memoryRequested + " bytes of memory"); } - this.sorter = new UnsafeShuffleInMemorySorter(initialSize); + this.inMemSorter = new UnsafeShuffleInMemorySorter(initialSize); } /** @@ -156,7 +167,7 @@ private void writeSortedFile(boolean isLastFile) throws IOException { // This call performs the actual sort. final UnsafeShuffleInMemorySorter.UnsafeShuffleSorterIterator sortedRecords = - sorter.getSortedIterator(); + inMemSorter.getSortedIterator(); // Currently, we need to open a new DiskBlockObjectWriter for each partition; we can avoid this // after SPARK-5581 is fixed. @@ -202,18 +213,14 @@ private void writeSortedFile(boolean isLastFile) throws IOException { } final long recordPointer = sortedRecords.packedRecordPointer.getRecordPointer(); - final Object recordPage = memoryManager.getPage(recordPointer); - final long recordOffsetInPage = memoryManager.getOffsetInPage(recordPointer); - int dataRemaining = PlatformDependent.UNSAFE.getInt(recordPage, recordOffsetInPage); + final Object recordPage = taskMemoryManager.getPage(recordPointer); + final long recordOffsetInPage = taskMemoryManager.getOffsetInPage(recordPointer); + int dataRemaining = Platform.getInt(recordPage, recordOffsetInPage); long recordReadPosition = recordOffsetInPage + 4; // skip over record length while (dataRemaining > 0) { final int toTransfer = Math.min(DISK_WRITE_BUFFER_SIZE, dataRemaining); - PlatformDependent.copyMemory( - recordPage, - recordReadPosition, - writeBuffer, - PlatformDependent.BYTE_ARRAY_OFFSET, - toTransfer); + Platform.copyMemory( + recordPage, recordReadPosition, writeBuffer, Platform.BYTE_ARRAY_OFFSET, toTransfer); writer.write(writeBuffer, 0, toTransfer); recordReadPosition += toTransfer; dataRemaining -= toTransfer; @@ -265,9 +272,9 @@ void spill() throws IOException { spills.size() > 1 ? " times" : " time"); writeSortedFile(false); - final long sorterMemoryUsage = sorter.getMemoryUsage(); - sorter = null; - shuffleMemoryManager.release(sorterMemoryUsage); + final long inMemSorterMemoryUsage = inMemSorter.getMemoryUsage(); + inMemSorter = null; + shuffleMemoryManager.release(inMemSorterMemoryUsage); final long spillSize = freeMemory(); taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); @@ -279,13 +286,29 @@ private long getMemoryUsage() { for (MemoryBlock page : allocatedPages) { totalPageSize += page.size(); } - return sorter.getMemoryUsage() + totalPageSize; + return ((inMemSorter == null) ? 0 : inMemSorter.getMemoryUsage()) + totalPageSize; + } + + private void updatePeakMemoryUsed() { + long mem = getMemoryUsage(); + if (mem > peakMemoryUsedBytes) { + peakMemoryUsedBytes = mem; + } + } + + /** + * Return the peak memory used so far, in bytes. + */ + long getPeakMemoryUsedBytes() { + updatePeakMemoryUsed(); + return peakMemoryUsedBytes; } private long freeMemory() { + updatePeakMemoryUsed(); long memoryFreed = 0; for (MemoryBlock block : allocatedPages) { - memoryManager.freePage(block); + taskMemoryManager.freePage(block); shuffleMemoryManager.release(block.size()); memoryFreed += block.size(); } @@ -299,54 +322,53 @@ private long freeMemory() { /** * Force all memory and spill files to be deleted; called by shuffle error-handling code. 
*/ - public void cleanupAfterError() { + public void cleanupResources() { freeMemory(); for (SpillInfo spill : spills) { if (spill.file.exists() && !spill.file.delete()) { logger.error("Unable to delete spill file {}", spill.file.getPath()); } } - if (sorter != null) { - shuffleMemoryManager.release(sorter.getMemoryUsage()); - sorter = null; + if (inMemSorter != null) { + shuffleMemoryManager.release(inMemSorter.getMemoryUsage()); + inMemSorter = null; } } /** - * Checks whether there is enough space to insert a new record into the sorter. - * - * @param requiredSpace the required space in the data page, in bytes, including space for storing - * the record size. - - * @return true if the record can be inserted without requiring more allocations, false otherwise. + * Checks whether there is enough space to insert an additional record in to the sort pointer + * array and grows the array if additional space is required. If the required space cannot be + * obtained, then the in-memory data will be spilled to disk. */ - private boolean haveSpaceForRecord(int requiredSpace) { - assert (requiredSpace > 0); - return (sorter.hasSpaceForAnotherRecord() && (requiredSpace <= freeSpaceInCurrentPage)); - } - - /** - * Allocates more memory in order to insert an additional record. This will request additional - * memory from the {@link ShuffleMemoryManager} and spill if the requested memory can not be - * obtained. - * - * @param requiredSpace the required space in the data page, in bytes, including space for storing - * the record size. - */ - private void allocateSpaceForRecord(int requiredSpace) throws IOException { - if (!sorter.hasSpaceForAnotherRecord()) { + private void growPointerArrayIfNecessary() throws IOException { + assert(inMemSorter != null); + if (!inMemSorter.hasSpaceForAnotherRecord()) { logger.debug("Attempting to expand sort pointer array"); - final long oldPointerArrayMemoryUsage = sorter.getMemoryUsage(); + final long oldPointerArrayMemoryUsage = inMemSorter.getMemoryUsage(); final long memoryToGrowPointerArray = oldPointerArrayMemoryUsage * 2; final long memoryAcquired = shuffleMemoryManager.tryToAcquire(memoryToGrowPointerArray); if (memoryAcquired < memoryToGrowPointerArray) { shuffleMemoryManager.release(memoryAcquired); spill(); } else { - sorter.expandPointerArray(); + inMemSorter.expandPointerArray(); shuffleMemoryManager.release(oldPointerArrayMemoryUsage); } } + } + + /** + * Allocates more memory in order to insert an additional record. This will request additional + * memory from the {@link ShuffleMemoryManager} and spill if the requested memory can not be + * obtained. + * + * @param requiredSpace the required space in the data page, in bytes, including space for storing + * the record size. This must be less than or equal to the page size (records + * that exceed the page size are handled via a different code path which uses + * special overflow pages). 
+ */ + private void acquireNewPageIfNecessary(int requiredSpace) throws IOException { + growPointerArrayIfNecessary(); if (requiredSpace > freeSpaceInCurrentPage) { logger.trace("Required space {} is less than free space in current page ({})", requiredSpace, freeSpaceInCurrentPage); @@ -367,7 +389,7 @@ private void allocateSpaceForRecord(int requiredSpace) throws IOException { throw new IOException("Unable to acquire " + pageSizeBytes + " bytes of memory"); } } - currentPage = memoryManager.allocatePage(pageSizeBytes); + currentPage = taskMemoryManager.allocatePage(pageSizeBytes); currentPagePosition = currentPage.getBaseOffset(); freeSpaceInCurrentPage = pageSizeBytes; allocatedPages.add(currentPage); @@ -383,27 +405,54 @@ public void insertRecord( long recordBaseOffset, int lengthInBytes, int partitionId) throws IOException { + + growPointerArrayIfNecessary(); // Need 4 bytes to store the record length. final int totalSpaceRequired = lengthInBytes + 4; - if (!haveSpaceForRecord(totalSpaceRequired)) { - allocateSpaceForRecord(totalSpaceRequired); + + // --- Figure out where to insert the new record ---------------------------------------------- + + final MemoryBlock dataPage; + long dataPagePosition; + boolean useOverflowPage = totalSpaceRequired > pageSizeBytes; + if (useOverflowPage) { + long overflowPageSize = ByteArrayMethods.roundNumberOfBytesToNearestWord(totalSpaceRequired); + // The record is larger than the page size, so allocate a special overflow page just to hold + // that record. + final long memoryGranted = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGranted != overflowPageSize) { + shuffleMemoryManager.release(memoryGranted); + spill(); + final long memoryGrantedAfterSpill = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGrantedAfterSpill != overflowPageSize) { + shuffleMemoryManager.release(memoryGrantedAfterSpill); + throw new IOException("Unable to acquire " + overflowPageSize + " bytes of memory"); + } + } + MemoryBlock overflowPage = taskMemoryManager.allocatePage(overflowPageSize); + allocatedPages.add(overflowPage); + dataPage = overflowPage; + dataPagePosition = overflowPage.getBaseOffset(); + } else { + // The record is small enough to fit in a regular data page, but the current page might not + // have enough space to hold it (or no pages have been allocated yet). 
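The insertRecord() rework above splits record placement into two paths: a record whose total footprint (payload plus the 4-byte length header) exceeds the page size gets a dedicated overflow page sized for that one record, while everything else is appended to the shared current page. Below is a minimal, self-contained sketch of that placement policy; the class and helper names are illustrative rather than Spark's, and the rounding of the overflow page to a word boundary is omitted.

import java.util.ArrayList;
import java.util.List;

// Toy model of the placement policy: oversized records get their own page,
// small records share the current page and advance its cursor.
public class PagePlacementSketch {
  static final int PAGE_SIZE = 64;                 // toy page size in bytes
  static final List<byte[]> pages = new ArrayList<>();
  static byte[] currentPage;
  static int cursor;                               // next free offset in currentPage

  static void insert(byte[] record) {
    int required = record.length + 4;              // 4 bytes reserved for the length header
    byte[] target;
    int offset;
    if (required > PAGE_SIZE) {
      // Overflow path: allocate a one-off page that holds exactly this record.
      target = new byte[required];
      pages.add(target);
      offset = 0;
    } else {
      // Regular path: open a fresh shared page only if the current one is too full.
      if (currentPage == null || required > currentPage.length - cursor) {
        currentPage = new byte[PAGE_SIZE];
        pages.add(currentPage);
        cursor = 0;
      }
      target = currentPage;
      offset = cursor;
      cursor += required;
    }
    // Layout mirrors the sorter above: a 4-byte length followed by the payload.
    target[offset]     = (byte) (record.length >>> 24);
    target[offset + 1] = (byte) (record.length >>> 16);
    target[offset + 2] = (byte) (record.length >>> 8);
    target[offset + 3] = (byte) record.length;
    System.arraycopy(record, 0, target, offset + 4, record.length);
  }

  public static void main(String[] args) {
    insert(new byte[10]);   // opens the first shared page
    insert(new byte[10]);   // fits in the same shared page
    insert(new byte[200]);  // larger than PAGE_SIZE, so it gets an overflow page
    System.out.println("pages allocated: " + pages.size());  // prints 2
  }
}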
+ acquireNewPageIfNecessary(totalSpaceRequired); + dataPage = currentPage; + dataPagePosition = currentPagePosition; + // Update bookkeeping information + freeSpaceInCurrentPage -= totalSpaceRequired; + currentPagePosition += totalSpaceRequired; } + final Object dataPageBaseObject = dataPage.getBaseObject(); final long recordAddress = - memoryManager.encodePageNumberAndOffset(currentPage, currentPagePosition); - final Object dataPageBaseObject = currentPage.getBaseObject(); - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, currentPagePosition, lengthInBytes); - currentPagePosition += 4; - freeSpaceInCurrentPage -= 4; - PlatformDependent.copyMemory( - recordBaseObject, - recordBaseOffset, - dataPageBaseObject, - currentPagePosition, - lengthInBytes); - currentPagePosition += lengthInBytes; - freeSpaceInCurrentPage -= lengthInBytes; - sorter.insertRecord(recordAddress, partitionId); + taskMemoryManager.encodePageNumberAndOffset(dataPage, dataPagePosition); + Platform.putInt(dataPageBaseObject, dataPagePosition, lengthInBytes); + dataPagePosition += 4; + Platform.copyMemory( + recordBaseObject, recordBaseOffset, dataPageBaseObject, dataPagePosition, lengthInBytes); + assert(inMemSorter != null); + inMemSorter.insertRecord(recordAddress, partitionId); } /** @@ -415,14 +464,14 @@ public void insertRecord( */ public SpillInfo[] closeAndGetSpills() throws IOException { try { - if (sorter != null) { + if (inMemSorter != null) { // Do not count the final file towards the spill count. writeSortedFile(true); freeMemory(); } return spills.toArray(new SpillInfo[spills.size()]); } catch (IOException e) { - cleanupAfterError(); + cleanupResources(); throw e; } } diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java index d47d6fc9c2ac..2389c28b2839 100644 --- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java @@ -17,14 +17,15 @@ package org.apache.spark.shuffle.unsafe; +import javax.annotation.Nullable; import java.io.*; import java.nio.channels.FileChannel; import java.util.Iterator; -import javax.annotation.Nullable; import scala.Option; import scala.Product2; import scala.collection.JavaConversions; +import scala.collection.immutable.Map; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; @@ -37,10 +38,10 @@ import org.apache.spark.*; import org.apache.spark.annotation.Private; +import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; import org.apache.spark.io.LZFCompressionCodec; -import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.network.util.LimitedInputStream; import org.apache.spark.scheduler.MapStatus; import org.apache.spark.scheduler.MapStatus$; @@ -52,7 +53,7 @@ import org.apache.spark.shuffle.ShuffleWriter; import org.apache.spark.storage.BlockManager; import org.apache.spark.storage.TimeTrackingOutputStream; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.TaskMemoryManager; @Private @@ -78,8 +79,9 @@ public class UnsafeShuffleWriter extends ShuffleWriter { private final SparkConf sparkConf; private final boolean transferToEnabled; - private MapStatus mapStatus = null; - private UnsafeShuffleExternalSorter sorter = null; + @Nullable private MapStatus mapStatus; + 
@Nullable private UnsafeShuffleExternalSorter sorter; + private long peakMemoryUsedBytes = 0; /** Subclass of ByteArrayOutputStream that exposes `buf` directly. */ private static final class MyByteArrayOutputStream extends ByteArrayOutputStream { @@ -131,9 +133,28 @@ public UnsafeShuffleWriter( @VisibleForTesting public int maxRecordSizeBytes() { + assert(sorter != null); return sorter.maxRecordSizeBytes; } + private void updatePeakMemoryUsed() { + // sorter can be null if this writer is closed + if (sorter != null) { + long mem = sorter.getPeakMemoryUsedBytes(); + if (mem > peakMemoryUsedBytes) { + peakMemoryUsedBytes = mem; + } + } + } + + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsedBytes() { + updatePeakMemoryUsed(); + return peakMemoryUsedBytes; + } + /** * This convenience method should only be called in test code. */ @@ -144,7 +165,7 @@ public void write(Iterator> records) throws IOException { @Override public void write(scala.collection.Iterator> records) throws IOException { - // Keep track of success so we know if we ecountered an exception + // Keep track of success so we know if we encountered an exception // We do this rather than a standard try/catch/re-throw to handle // generic throwables. boolean success = false; @@ -157,7 +178,7 @@ public void write(scala.collection.Iterator> records) throws IOEx } finally { if (sorter != null) { try { - sorter.cleanupAfterError(); + sorter.cleanupResources(); } catch (Exception e) { // Only throw this error if we won't be masking another // error. @@ -189,6 +210,8 @@ private void open() throws IOException { @VisibleForTesting void closeAndWriteOutput() throws IOException { + assert(sorter != null); + updatePeakMemoryUsed(); serBuffer = null; serOutputStream = null; final SpillInfo[] spills = sorter.closeAndGetSpills(); @@ -209,6 +232,7 @@ void closeAndWriteOutput() throws IOException { @VisibleForTesting void insertRecordIntoSorter(Product2 record) throws IOException { + assert(sorter != null); final K key = record._1(); final int partitionId = partitioner.getPartition(key); serBuffer.reset(); @@ -220,7 +244,7 @@ void insertRecordIntoSorter(Product2 record) throws IOException { assert (serializedRecordSize > 0); sorter.insertRecord( - serBuffer.getBuf(), PlatformDependent.BYTE_ARRAY_OFFSET, serializedRecordSize, partitionId); + serBuffer.getBuf(), Platform.BYTE_ARRAY_OFFSET, serializedRecordSize, partitionId); } @VisibleForTesting @@ -431,6 +455,14 @@ private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) th @Override public Option stop(boolean success) { try { + // Update task metrics from accumulators (null in UnsafeShuffleWriterSuite) + Map> internalAccumulators = + taskContext.internalMetricsToAccumulators(); + if (internalAccumulators != null) { + internalAccumulators.apply(InternalAccumulator.PEAK_EXECUTION_MEMORY()) + .add(getPeakMemoryUsedBytes()); + } + if (stopping) { return Option.apply(null); } else { @@ -450,7 +482,7 @@ public Option stop(boolean success) { if (sorter != null) { // If sorter is non-null, then this implies that we called stop() in response to an error, // so we need to clean up memory and spill files created by the sorter - sorter.cleanupAfterError(); + sorter.cleanupResources(); } } } diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 01a66084e918..b24eed3952fd 100644 --- 
a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -17,25 +17,24 @@ package org.apache.spark.unsafe.map; -import java.lang.Override; -import java.lang.UnsupportedOperationException; +import javax.annotation.Nullable; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import javax.annotation.Nullable; - import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.shuffle.ShuffleMemoryManager; -import org.apache.spark.unsafe.*; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.bitset.BitSet; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.*; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.memory.MemoryLocation; +import org.apache.spark.unsafe.memory.TaskMemoryManager; /** * An append-only hash map where keys and values are contiguous regions of bytes. @@ -93,9 +92,9 @@ public final class BytesToBytesMap { /** * The maximum number of keys that BytesToBytesMap supports. The hash table has to be - * power-of-2-sized and its backing Java array can contain at most (1 << 30) elements, since - * that's the largest power-of-2 that's less than Integer.MAX_VALUE. We need two long array - * entries per key, giving us a maximum capacity of (1 << 29). + * power-of-2-sized and its backing Java array can contain at most (1 << 30) elements, + * since that's the largest power-of-2 that's less than Integer.MAX_VALUE. We need two long array + * entries per key, giving us a maximum capacity of (1 << 29). */ @VisibleForTesting static final int MAX_CAPACITY = (1 << 29); @@ -109,7 +108,7 @@ public final class BytesToBytesMap { * Position {@code 2 * i} in the array is used to track a pointer to the key at index {@code i}, * while position {@code 2 * i + 1} in the array holds key's full 32-bit hashcode. */ - private LongArray longArray; + @Nullable private LongArray longArray; // TODO: we're wasting 32 bits of space here; we can probably store fewer bits of the hashcode // and exploit word-alignment to use fewer bits to hold the address. This might let us store // only one long per map entry, increasing the chance that this array will fit in cache at the @@ -124,7 +123,7 @@ public final class BytesToBytesMap { * A {@link BitSet} used to track location of the map where the key is set. * Size of the bitset should be half of the size of the long array. */ - private BitSet bitset; + @Nullable private BitSet bitset; private final double loadFactor; @@ -166,6 +165,8 @@ public final class BytesToBytesMap { private long numHashCollisions = 0; + private long peakMemoryUsedBytes = 0L; + public BytesToBytesMap( TaskMemoryManager taskMemoryManager, ShuffleMemoryManager shuffleMemoryManager, @@ -191,6 +192,11 @@ public BytesToBytesMap( TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES); } allocate(initialCapacity); + + // Acquire a new page as soon as we construct the map to ensure that we have at least + // one page to work with. Otherwise, other operators in the same task may starve this + // map (SPARK-9747). 
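The comment above (and the matching SPARK-9709 comment in UnsafeShuffleExternalSorter) explains why a page is acquired eagerly in the constructor: operators in the same task draw from a shared memory budget, and a lazily allocating operator can otherwise be left with nothing. The toy program below illustrates that failure mode with made-up class names; it is only a sketch of the shared-budget behavior, not Spark's ShuffleMemoryManager.

// Two operators share one fixed budget; whoever asks first can starve the other.
class ToyMemoryManager {
  private long free;
  ToyMemoryManager(long totalBytes) { this.free = totalBytes; }
  synchronized long tryToAcquire(long requested) {
    long granted = Math.min(requested, free);    // partial grants are possible
    free -= granted;
    return granted;
  }
}

public class StarvationSketch {
  public static void main(String[] args) {
    ToyMemoryManager mm = new ToyMemoryManager(64);
    long mapPages = mm.tryToAcquire(64);   // an aggregation map grabs the whole budget
    long sortPages = mm.tryToAcquire(16);  // a sorter created later gets nothing
    System.out.println(mapPages + " / " + sortPages);  // prints "64 / 0"
    // Acquiring one page per operator at construction time guarantees each of them
    // can make progress (and spill) instead of failing outright mid-task.
  }
}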
+ acquireNewPage(); } public BytesToBytesMap( @@ -227,22 +233,35 @@ public static final class BytesToBytesMapIterator implements Iterator private final Iterator dataPagesIterator; private final Location loc; - private MemoryBlock currentPage; + private MemoryBlock currentPage = null; private int currentRecordNumber = 0; private Object pageBaseObject; private long offsetInPage; + // If this iterator destructive or not. When it is true, it frees each page as it moves onto + // next one. + private boolean destructive = false; + private BytesToBytesMap bmap; + private BytesToBytesMapIterator( - int numRecords, Iterator dataPagesIterator, Location loc) { + int numRecords, Iterator dataPagesIterator, Location loc, + boolean destructive, BytesToBytesMap bmap) { this.numRecords = numRecords; this.dataPagesIterator = dataPagesIterator; this.loc = loc; + this.destructive = destructive; + this.bmap = bmap; if (dataPagesIterator.hasNext()) { advanceToNextPage(); } } private void advanceToNextPage() { + if (destructive && currentPage != null) { + dataPagesIterator.remove(); + this.bmap.taskMemoryManager.freePage(currentPage); + this.bmap.shuffleMemoryManager.release(currentPage.size()); + } currentPage = dataPagesIterator.next(); pageBaseObject = currentPage.getBaseObject(); offsetInPage = currentPage.getBaseOffset(); @@ -255,10 +274,10 @@ public boolean hasNext() { @Override public Location next() { - int totalLength = PlatformDependent.UNSAFE.getInt(pageBaseObject, offsetInPage); + int totalLength = Platform.getInt(pageBaseObject, offsetInPage); if (totalLength == END_OF_PAGE_MARKER) { advanceToNextPage(); - totalLength = PlatformDependent.UNSAFE.getInt(pageBaseObject, offsetInPage); + totalLength = Platform.getInt(pageBaseObject, offsetInPage); } loc.with(currentPage, offsetInPage); offsetInPage += 4 + totalLength; @@ -281,7 +300,21 @@ public void remove() { * `lookup()`, the behavior of the returned iterator is undefined. */ public BytesToBytesMapIterator iterator() { - return new BytesToBytesMapIterator(numElements, dataPages.iterator(), loc); + return new BytesToBytesMapIterator(numElements, dataPages.iterator(), loc, false, this); + } + + /** + * Returns a destructive iterator for iterating over the entries of this map. It frees each page + * as it moves onto next one. Notice: it is illegal to call any method on the map after + * `destructiveIterator()` has been called. + * + * For efficiency, all calls to `next()` will return the same {@link Location} object. + * + * If any other lookups or operations are performed on this map while iterating over it, including + * `lookup()`, the behavior of the returned iterator is undefined. + */ + public BytesToBytesMapIterator destructiveIterator() { + return new BytesToBytesMapIterator(numElements, dataPages.iterator(), loc, true, this); } /** @@ -294,6 +327,23 @@ public Location lookup( Object keyBaseObject, long keyBaseOffset, int keyRowLengthBytes) { + safeLookup(keyBaseObject, keyBaseOffset, keyRowLengthBytes, loc); + return loc; + } + + /** + * Looks up a key, and saves the result in provided `loc`. + * + * This is a thread-safe version of `lookup`, could be used by multiple threads. + */ + public void safeLookup( + Object keyBaseObject, + long keyBaseOffset, + int keyRowLengthBytes, + Location loc) { + assert(bitset != null); + assert(longArray != null); + if (enablePerfMetrics) { numKeyLookups++; } @@ -306,7 +356,8 @@ public Location lookup( } if (!bitset.isSet(pos)) { // This is a new key. 
- return loc.with(pos, hashcode, false); + loc.with(pos, hashcode, false); + return; } else { long stored = longArray.get(pos * 2 + 1); if ((int) (stored) == hashcode) { @@ -324,7 +375,7 @@ public Location lookup( keyRowLengthBytes ); if (areEqual) { - return loc; + return; } else { if (enablePerfMetrics) { numHashCollisions++; @@ -370,9 +421,9 @@ private void updateAddressesAndSizes(long fullKeyAddress) { private void updateAddressesAndSizes(final Object page, final long offsetInPage) { long position = offsetInPage; - final int totalLength = PlatformDependent.UNSAFE.getInt(page, position); + final int totalLength = Platform.getInt(page, position); position += 4; - keyLength = PlatformDependent.UNSAFE.getInt(page, position); + keyLength = Platform.getInt(page, position); position += 4; valueLength = totalLength - keyLength - 4; @@ -383,6 +434,7 @@ private void updateAddressesAndSizes(final Object page, final long offsetInPage) } private Location with(int pos, int keyHashcode, boolean isDefined) { + assert(longArray != null); this.pos = pos; this.isDefined = isDefined; this.keyHashcode = keyHashcode; @@ -498,6 +550,9 @@ public boolean putNewKey( assert (!isDefined) : "Can only set value once for a key"; assert (keyLengthBytes % 8 == 0); assert (valueLengthBytes % 8 == 0); + assert(bitset != null); + assert(longArray != null); + if (numElements == MAX_CAPACITY) { throw new IllegalStateException("BytesToBytesMap has reached maximum capacity"); } @@ -505,7 +560,7 @@ public boolean putNewKey( // Here, we'll copy the data into our data pages. Because we only store a relative offset from // the key address instead of storing the absolute address of the value, the key and value // must be stored in the same memory page. - // (8 byte key length) (key) (8 byte value length) (value) + // (8 byte key length) (key) (value) final long requiredSize = 8 + keyLengthBytes + valueLengthBytes; // --- Figure out where to insert the new record --------------------------------------------- @@ -536,18 +591,11 @@ public boolean putNewKey( // There wasn't enough space in the current page, so write an end-of-page marker: final Object pageBaseObject = currentDataPage.getBaseObject(); final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor; - PlatformDependent.UNSAFE.putInt(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER); + Platform.putInt(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER); } - final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes); - if (memoryGranted != pageSizeBytes) { - shuffleMemoryManager.release(memoryGranted); - logger.debug("Failed to acquire {} bytes of memory", pageSizeBytes); + if (!acquireNewPage()) { return false; } - MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes); - dataPages.add(newPage); - pageCursor = 0; - currentDataPage = newPage; dataPage = currentDataPage; dataPageBaseObject = currentDataPage.getBaseObject(); dataPageInsertOffset = currentDataPage.getBaseOffset(); @@ -572,21 +620,21 @@ public boolean putNewKey( final long valueDataOffsetInPage = insertCursor; insertCursor += valueLengthBytes; // word used to store the value size - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, recordOffset, + Platform.putInt(dataPageBaseObject, recordOffset, keyLengthBytes + valueLengthBytes + 4); - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, keyLengthOffset, keyLengthBytes); + Platform.putInt(dataPageBaseObject, keyLengthOffset, keyLengthBytes); // Copy the key - PlatformDependent.copyMemory( + 
Platform.copyMemory( keyBaseObject, keyBaseOffset, dataPageBaseObject, keyDataOffsetInPage, keyLengthBytes); // Copy the value - PlatformDependent.copyMemory(valueBaseObject, valueBaseOffset, dataPageBaseObject, + Platform.copyMemory(valueBaseObject, valueBaseOffset, dataPageBaseObject, valueDataOffsetInPage, valueLengthBytes); // --- Update bookeeping data structures ----------------------------------------------------- if (useOverflowPage) { // Store the end-of-page marker at the end of the data page - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, insertCursor, END_OF_PAGE_MARKER); + Platform.putInt(dataPageBaseObject, insertCursor, END_OF_PAGE_MARKER); } else { pageCursor += requiredSize; } @@ -606,6 +654,24 @@ public boolean putNewKey( } } + /** + * Acquire a new page from the {@link ShuffleMemoryManager}. + * @return whether there is enough space to allocate the new page. + */ + private boolean acquireNewPage() { + final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes); + if (memoryGranted != pageSizeBytes) { + shuffleMemoryManager.release(memoryGranted); + logger.debug("Failed to acquire {} bytes of memory", pageSizeBytes); + return false; + } + MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes); + dataPages.add(newPage); + pageCursor = 0; + currentDataPage = newPage; + return true; + } + /** * Allocate new data structures for this map. When calling this outside of the constructor, * make sure to keep references to the old data structures so that you can free them. @@ -615,7 +681,7 @@ public boolean putNewKey( private void allocate(int capacity) { assert (capacity >= 0); // The capacity needs to be divisible by 64 so that our bit set can be sized properly - capacity = Math.max((int) Math.min(MAX_CAPACITY, nextPowerOf2(capacity)), 64); + capacity = Math.max((int) Math.min(MAX_CAPACITY, ByteArrayMethods.nextPowerOf2(capacity)), 64); assert (capacity <= MAX_CAPACITY); longArray = new LongArray(MemoryBlock.fromLongArray(new long[capacity * 2])); bitset = new BitSet(MemoryBlock.fromLongArray(new long[capacity / 64])); @@ -631,6 +697,7 @@ private void allocate(int capacity) { * This method is idempotent and can be called multiple times. */ public void free() { + updatePeakMemoryUsed(); longArray = null; bitset = null; Iterator dataPagesIterator = dataPages.iterator(); @@ -655,13 +722,32 @@ public long getPageSizeBytes() { return pageSizeBytes; } - /** Returns the total amount of memory, in bytes, consumed by this map's managed structures. */ + /** + * Returns the total amount of memory, in bytes, consumed by this map's managed structures. + */ public long getTotalMemoryConsumption() { long totalDataPagesSize = 0L; for (MemoryBlock dataPage : dataPages) { totalDataPagesSize += dataPage.size(); } - return totalDataPagesSize + bitset.memoryBlock().size() + longArray.memoryBlock().size(); + return totalDataPagesSize + + ((bitset != null) ? bitset.memoryBlock().size() : 0L) + + ((longArray != null) ? longArray.memoryBlock().size() : 0L); + } + + private void updatePeakMemoryUsed() { + long mem = getTotalMemoryConsumption(); + if (mem > peakMemoryUsedBytes) { + peakMemoryUsedBytes = mem; + } + } + + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsedBytes() { + updatePeakMemoryUsed(); + return peakMemoryUsedBytes; } /** @@ -674,7 +760,6 @@ public long getTimeSpentResizingNs() { return timeSpentResizingNs; } - /** * Returns the average number of probes per key lookup. 
*/ @@ -693,7 +778,7 @@ public long getNumHashCollisions() { } @VisibleForTesting - int getNumDataPages() { + public int getNumDataPages() { return dataPages.size(); } @@ -702,6 +787,9 @@ int getNumDataPages() { */ @VisibleForTesting void growAndRehash() { + assert(bitset != null); + assert(longArray != null); + long resizeStartTime = -1; if (enablePerfMetrics) { resizeStartTime = System.nanoTime(); @@ -741,10 +829,4 @@ void growAndRehash() { timeSpentResizingNs += System.nanoTime() - resizeStartTime; } } - - /** Returns the next number greater or equal num that is power of 2. */ - private static long nextPowerOf2(long num) { - final long highBit = Long.highestOneBit(num); - return (highBit == num) ? num : highBit << 1; - } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java index 4d7e5b3dfba6..71b76d5ddfaa 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java @@ -20,6 +20,7 @@ import com.google.common.primitives.UnsignedLongs; import org.apache.spark.annotation.Private; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.types.UTF8String; import org.apache.spark.util.Utils; @@ -29,6 +30,8 @@ private PrefixComparators() {} public static final StringPrefixComparator STRING = new StringPrefixComparator(); public static final StringPrefixComparatorDesc STRING_DESC = new StringPrefixComparatorDesc(); + public static final BinaryPrefixComparator BINARY = new BinaryPrefixComparator(); + public static final BinaryPrefixComparatorDesc BINARY_DESC = new BinaryPrefixComparatorDesc(); public static final LongPrefixComparator LONG = new LongPrefixComparator(); public static final LongPrefixComparatorDesc LONG_DESC = new LongPrefixComparatorDesc(); public static final DoublePrefixComparator DOUBLE = new DoublePrefixComparator(); @@ -52,6 +55,38 @@ public int compare(long bPrefix, long aPrefix) { } } + public static final class BinaryPrefixComparator extends PrefixComparator { + @Override + public int compare(long aPrefix, long bPrefix) { + return UnsignedLongs.compare(aPrefix, bPrefix); + } + + public static long computePrefix(byte[] bytes) { + if (bytes == null) { + return 0L; + } else { + /** + * TODO: If a wrapper for BinaryType is created (SPARK-8786), + * these codes below will be in the wrapper class. 
+ */ + final int minLen = Math.min(bytes.length, 8); + long p = 0; + for (int i = 0; i < minLen; ++i) { + p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i)) + << (56 - 8 * i); + } + return p; + } + } + } + + public static final class BinaryPrefixComparatorDesc extends PrefixComparator { + @Override + public int compare(long bPrefix, long aPrefix) { + return UnsignedLongs.compare(aPrefix, bPrefix); + } + } + public static final class LongPrefixComparator extends PrefixComparator { @Override public int compare(long a, long b) { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index b984301cbbf2..fc364e0a895b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -34,7 +34,8 @@ import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.shuffle.ShuffleMemoryManager; import org.apache.spark.storage.BlockManager; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.memory.TaskMemoryManager; import org.apache.spark.util.Utils; @@ -70,13 +71,14 @@ public final class UnsafeExternalSorter { private final LinkedList spillWriters = new LinkedList<>(); // These variables are reset after spilling: - private UnsafeInMemorySorter inMemSorter; + @Nullable private UnsafeInMemorySorter inMemSorter; // Whether the in-mem sorter is created internally, or passed in from outside. // If it is passed in from outside, we shouldn't release the in-mem sorter's memory. private boolean isInMemSorterExternal = false; private MemoryBlock currentPage = null; private long currentPagePosition = -1; private long freeSpaceInCurrentPage = 0; + private long peakMemoryUsedBytes = 0; public static UnsafeExternalSorter createWithExistingInMemorySorter( TaskMemoryManager taskMemoryManager, @@ -125,12 +127,15 @@ private UnsafeExternalSorter( // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; this.fileBufferSizeBytes = 32 * 1024; - // this.pageSizeBytes = conf.getSizeAsBytes("spark.buffer.pageSize", "64m"); this.pageSizeBytes = pageSizeBytes; this.writeMetrics = new ShuffleWriteMetrics(); if (existingInMemorySorter == null) { initializeForWriting(); + // Acquire a new page as soon as we construct the sorter to ensure that we have at + // least one page to work with. Otherwise, other operators in the same task may starve + // this sorter (SPARK-9709). We don't need to do this if we already have an existing sorter. + acquireNewPage(); } else { this.isInMemSorterExternal = true; this.inMemSorter = existingInMemorySorter; @@ -142,8 +147,7 @@ private UnsafeExternalSorter( taskContext.addOnCompleteCallback(new AbstractFunction0() { @Override public BoxedUnit apply() { - deleteSpillFiles(); - freeMemory(); + cleanupResources(); return null; } }); @@ -183,30 +187,36 @@ public void closeCurrentPage() { * Sort and spill the current records in response to memory pressure. 
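The computePrefix() added above packs up to the first eight bytes of the array into one long, adding 128 to each signed byte so that the signed byte ordering is preserved when the packed longs are compared as unsigned values. A standalone illustration follows, with a hypothetical class name and Long.compareUnsigned standing in for Guava's UnsignedLongs.

// Pack the first (up to) 8 bytes, biased by +128, most significant first, and
// compare prefixes as unsigned longs; ties beyond the prefix still need a full compare.
public class BinaryPrefixSketch {
  static long computePrefix(byte[] bytes) {
    if (bytes == null) {
      return 0L;
    }
    final int minLen = Math.min(bytes.length, 8);
    long p = 0;
    for (int i = 0; i < minLen; ++i) {
      p |= (128L + bytes[i]) << (56 - 8 * i);   // bias each signed byte into 0..255
    }
    return p;
  }

  public static void main(String[] args) {
    byte[] a = {-5, 3};
    byte[] b = {7, 1};
    // a sorts before b when bytes are compared as signed values, and the unsigned
    // comparison of the packed prefixes agrees without touching the arrays again.
    System.out.println(Long.compareUnsigned(computePrefix(a), computePrefix(b)) < 0);  // true
  }
}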
*/ public void spill() throws IOException { + assert(inMemSorter != null); logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)", Thread.currentThread().getId(), Utils.bytesToString(getMemoryUsage()), spillWriters.size(), spillWriters.size() > 1 ? " times" : " time"); - final UnsafeSorterSpillWriter spillWriter = - new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics, - inMemSorter.numRecords()); - spillWriters.add(spillWriter); - final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator(); - while (sortedRecords.hasNext()) { - sortedRecords.loadNext(); - final Object baseObject = sortedRecords.getBaseObject(); - final long baseOffset = sortedRecords.getBaseOffset(); - final int recordLength = sortedRecords.getRecordLength(); - spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix()); + // We only write out contents of the inMemSorter if it is not empty. + if (inMemSorter.numRecords() > 0) { + final UnsafeSorterSpillWriter spillWriter = + new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics, + inMemSorter.numRecords()); + spillWriters.add(spillWriter); + final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator(); + while (sortedRecords.hasNext()) { + sortedRecords.loadNext(); + final Object baseObject = sortedRecords.getBaseObject(); + final long baseOffset = sortedRecords.getBaseOffset(); + final int recordLength = sortedRecords.getRecordLength(); + spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix()); + } + spillWriter.close(); } - spillWriter.close(); + final long spillSize = freeMemory(); // Note that this is more-or-less going to be a multiple of the page size, so wasted space in // pages will currently be counted as memory spilled even though that space isn't actually // written to disk. This also counts the space needed to store the sorter's pointer array. taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); + initializeForWriting(); } @@ -219,7 +229,22 @@ private long getMemoryUsage() { for (MemoryBlock page : allocatedPages) { totalPageSize += page.size(); } - return inMemSorter.getMemoryUsage() + totalPageSize; + return ((inMemSorter == null) ? 0 : inMemSorter.getMemoryUsage()) + totalPageSize; + } + + private void updatePeakMemoryUsed() { + long mem = getMemoryUsage(); + if (mem > peakMemoryUsedBytes) { + peakMemoryUsedBytes = mem; + } + } + + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsedBytes() { + updatePeakMemoryUsed(); + return peakMemoryUsedBytes; } @VisibleForTesting @@ -232,7 +257,8 @@ public int getNumberOfAllocatedPages() { * * @return the number of bytes freed. */ - public long freeMemory() { + private long freeMemory() { + updatePeakMemoryUsed(); long memoryFreed = 0; for (MemoryBlock block : allocatedPages) { taskMemoryManager.freePage(block); @@ -257,42 +283,32 @@ public long freeMemory() { /** * Deletes any spill files created by this sorter. */ - public void deleteSpillFiles() { + private void deleteSpillFiles() { for (UnsafeSorterSpillWriter spill : spillWriters) { File file = spill.getFile(); if (file != null && file.exists()) { if (!file.delete()) { logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); - }; + } } } } /** - * Checks whether there is enough space to insert a new record into the sorter. - * - * @param requiredSpace the required space in the data page, in bytes, including space for storing - * the record size. 
- - * @return true if the record can be inserted without requiring more allocations, false otherwise. + * Frees this sorter's in-memory data structures and cleans up its spill files. */ - private boolean haveSpaceForRecord(int requiredSpace) { - assert (requiredSpace > 0); - return (inMemSorter.hasSpaceForAnotherRecord() && (requiredSpace <= freeSpaceInCurrentPage)); + public void cleanupResources() { + deleteSpillFiles(); + freeMemory(); } /** - * Allocates more memory in order to insert an additional record. This will request additional - * memory from the {@link ShuffleMemoryManager} and spill if the requested memory can not be - * obtained. - * - * @param requiredSpace the required space in the data page, in bytes, including space for storing - * the record size. + * Checks whether there is enough space to insert an additional record in to the sort pointer + * array and grows the array if additional space is required. If the required space cannot be + * obtained, then the in-memory data will be spilled to disk. */ - private void allocateSpaceForRecord(int requiredSpace) throws IOException { - // TODO: merge these steps to first calculate total memory requirements for this insert, - // then try to acquire; no point in acquiring sort buffer only to spill due to no space in the - // data page. + private void growPointerArrayIfNecessary() throws IOException { + assert(inMemSorter != null); if (!inMemSorter.hasSpaceForAnotherRecord()) { logger.debug("Attempting to expand sort pointer array"); final long oldPointerArrayMemoryUsage = inMemSorter.getMemoryUsage(); @@ -306,7 +322,20 @@ private void allocateSpaceForRecord(int requiredSpace) throws IOException { shuffleMemoryManager.release(oldPointerArrayMemoryUsage); } } + } + /** + * Allocates more memory in order to insert an additional record. This will request additional + * memory from the {@link ShuffleMemoryManager} and spill if the requested memory can not be + * obtained. + * + * @param requiredSpace the required space in the data page, in bytes, including space for storing + * the record size. This must be less than or equal to the page size (records + * that exceed the page size are handled via a different code path which uses + * special overflow pages). 
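The growPointerArrayIfNecessary() javadoc above describes the reserve-or-spill policy used throughout these sorters: request the extra space from the shuffle memory manager, and if the grant falls short, release it and spill instead of growing. A compact sketch of that policy is below, with invented interface names standing in for the real ShuffleMemoryManager and sorter types.

import java.io.IOException;

// Reserve-or-spill: doubling the pointer array is attempted only when the full
// reservation is granted; otherwise the partial grant is returned and data is spilled.
public class ReserveOrSpillSketch {
  interface Budget { long tryToAcquire(long bytes); void release(long bytes); }
  interface PointerArray {
    boolean hasSpaceForAnotherRecord();
    long getMemoryUsage();
    void expandPointerArray();
  }
  interface Spillable { void spill() throws IOException; }

  static void growIfNecessary(Budget budget, PointerArray array, Spillable owner)
      throws IOException {
    if (!array.hasSpaceForAnotherRecord()) {
      long oldUsage = array.getMemoryUsage();
      long needed = oldUsage * 2;                 // the array doubles when it grows
      long granted = budget.tryToAcquire(needed);
      if (granted < needed) {
        budget.release(granted);                  // hand back the partial grant
        owner.spill();                            // free memory by writing to disk instead
      } else {
        array.expandPointerArray();
        budget.release(oldUsage);                 // the old array's reservation is returned
      }
    }
  }
}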
+ */ + private void acquireNewPageIfNecessary(int requiredSpace) throws IOException { + assert (requiredSpace <= pageSizeBytes); if (requiredSpace > freeSpaceInCurrentPage) { logger.trace("Required space {} is less than free space in current page ({})", requiredSpace, freeSpaceInCurrentPage); @@ -317,24 +346,32 @@ private void allocateSpaceForRecord(int requiredSpace) throws IOException { throw new IOException("Required space " + requiredSpace + " is greater than page size (" + pageSizeBytes + ")"); } else { - final long memoryAcquired = shuffleMemoryManager.tryToAcquire(pageSizeBytes); - if (memoryAcquired < pageSizeBytes) { - if (memoryAcquired > 0) { - shuffleMemoryManager.release(memoryAcquired); - } - spill(); - final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(pageSizeBytes); - if (memoryAcquiredAfterSpilling != pageSizeBytes) { - shuffleMemoryManager.release(memoryAcquiredAfterSpilling); - throw new IOException("Unable to acquire " + pageSizeBytes + " bytes of memory"); - } - } - currentPage = taskMemoryManager.allocatePage(pageSizeBytes); - currentPagePosition = currentPage.getBaseOffset(); - freeSpaceInCurrentPage = pageSizeBytes; - allocatedPages.add(currentPage); + acquireNewPage(); + } + } + } + + /** + * Acquire a new page from the {@link ShuffleMemoryManager}. + * + * If there is not enough space to allocate the new page, spill all existing ones + * and try again. If there is still not enough space, report error to the caller. + */ + private void acquireNewPage() throws IOException { + final long memoryAcquired = shuffleMemoryManager.tryToAcquire(pageSizeBytes); + if (memoryAcquired < pageSizeBytes) { + shuffleMemoryManager.release(memoryAcquired); + spill(); + final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(pageSizeBytes); + if (memoryAcquiredAfterSpilling != pageSizeBytes) { + shuffleMemoryManager.release(memoryAcquiredAfterSpilling); + throw new IOException("Unable to acquire " + pageSizeBytes + " bytes of memory"); } } + currentPage = taskMemoryManager.allocatePage(pageSizeBytes); + currentPagePosition = currentPage.getBaseOffset(); + freeSpaceInCurrentPage = pageSizeBytes; + allocatedPages.add(currentPage); } /** @@ -345,25 +382,55 @@ public void insertRecord( long recordBaseOffset, int lengthInBytes, long prefix) throws IOException { + + growPointerArrayIfNecessary(); // Need 4 bytes to store the record length. final int totalSpaceRequired = lengthInBytes + 4; - if (!haveSpaceForRecord(totalSpaceRequired)) { - allocateSpaceForRecord(totalSpaceRequired); + + // --- Figure out where to insert the new record ---------------------------------------------- + + final MemoryBlock dataPage; + long dataPagePosition; + boolean useOverflowPage = totalSpaceRequired > pageSizeBytes; + if (useOverflowPage) { + long overflowPageSize = ByteArrayMethods.roundNumberOfBytesToNearestWord(totalSpaceRequired); + // The record is larger than the page size, so allocate a special overflow page just to hold + // that record. 
+ final long memoryGranted = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGranted != overflowPageSize) { + shuffleMemoryManager.release(memoryGranted); + spill(); + final long memoryGrantedAfterSpill = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGrantedAfterSpill != overflowPageSize) { + shuffleMemoryManager.release(memoryGrantedAfterSpill); + throw new IOException("Unable to acquire " + overflowPageSize + " bytes of memory"); + } + } + MemoryBlock overflowPage = taskMemoryManager.allocatePage(overflowPageSize); + allocatedPages.add(overflowPage); + dataPage = overflowPage; + dataPagePosition = overflowPage.getBaseOffset(); + } else { + // The record is small enough to fit in a regular data page, but the current page might not + // have enough space to hold it (or no pages have been allocated yet). + acquireNewPageIfNecessary(totalSpaceRequired); + dataPage = currentPage; + dataPagePosition = currentPagePosition; + // Update bookkeeping information + freeSpaceInCurrentPage -= totalSpaceRequired; + currentPagePosition += totalSpaceRequired; } + final Object dataPageBaseObject = dataPage.getBaseObject(); + + // --- Insert the record ---------------------------------------------------------------------- final long recordAddress = - taskMemoryManager.encodePageNumberAndOffset(currentPage, currentPagePosition); - final Object dataPageBaseObject = currentPage.getBaseObject(); - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, currentPagePosition, lengthInBytes); - currentPagePosition += 4; - PlatformDependent.copyMemory( - recordBaseObject, - recordBaseOffset, - dataPageBaseObject, - currentPagePosition, - lengthInBytes); - currentPagePosition += lengthInBytes; - freeSpaceInCurrentPage -= totalSpaceRequired; + taskMemoryManager.encodePageNumberAndOffset(dataPage, dataPagePosition); + Platform.putInt(dataPageBaseObject, dataPagePosition, lengthInBytes); + dataPagePosition += 4; + Platform.copyMemory( + recordBaseObject, recordBaseOffset, dataPageBaseObject, dataPagePosition, lengthInBytes); + assert(inMemSorter != null); inMemSorter.insertRecord(recordAddress, prefix); } @@ -378,34 +445,71 @@ public void insertRecord( public void insertKVRecord( Object keyBaseObj, long keyOffset, int keyLen, Object valueBaseObj, long valueOffset, int valueLen, long prefix) throws IOException { + + growPointerArrayIfNecessary(); final int totalSpaceRequired = keyLen + valueLen + 4 + 4; - if (!haveSpaceForRecord(totalSpaceRequired)) { - allocateSpaceForRecord(totalSpaceRequired); + + // --- Figure out where to insert the new record ---------------------------------------------- + + final MemoryBlock dataPage; + long dataPagePosition; + boolean useOverflowPage = totalSpaceRequired > pageSizeBytes; + if (useOverflowPage) { + long overflowPageSize = ByteArrayMethods.roundNumberOfBytesToNearestWord(totalSpaceRequired); + // The record is larger than the page size, so allocate a special overflow page just to hold + // that record. 
+ final long memoryGranted = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGranted != overflowPageSize) { + shuffleMemoryManager.release(memoryGranted); + spill(); + final long memoryGrantedAfterSpill = shuffleMemoryManager.tryToAcquire(overflowPageSize); + if (memoryGrantedAfterSpill != overflowPageSize) { + shuffleMemoryManager.release(memoryGrantedAfterSpill); + throw new IOException("Unable to acquire " + overflowPageSize + " bytes of memory"); + } + } + MemoryBlock overflowPage = taskMemoryManager.allocatePage(overflowPageSize); + allocatedPages.add(overflowPage); + dataPage = overflowPage; + dataPagePosition = overflowPage.getBaseOffset(); + } else { + // The record is small enough to fit in a regular data page, but the current page might not + // have enough space to hold it (or no pages have been allocated yet). + acquireNewPageIfNecessary(totalSpaceRequired); + dataPage = currentPage; + dataPagePosition = currentPagePosition; + // Update bookkeeping information + freeSpaceInCurrentPage -= totalSpaceRequired; + currentPagePosition += totalSpaceRequired; } + final Object dataPageBaseObject = dataPage.getBaseObject(); + + // --- Insert the record ---------------------------------------------------------------------- final long recordAddress = - taskMemoryManager.encodePageNumberAndOffset(currentPage, currentPagePosition); - final Object dataPageBaseObject = currentPage.getBaseObject(); - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, currentPagePosition, keyLen + valueLen + 4); - currentPagePosition += 4; + taskMemoryManager.encodePageNumberAndOffset(dataPage, dataPagePosition); + Platform.putInt(dataPageBaseObject, dataPagePosition, keyLen + valueLen + 4); + dataPagePosition += 4; - PlatformDependent.UNSAFE.putInt(dataPageBaseObject, currentPagePosition, keyLen); - currentPagePosition += 4; + Platform.putInt(dataPageBaseObject, dataPagePosition, keyLen); + dataPagePosition += 4; - PlatformDependent.copyMemory( - keyBaseObj, keyOffset, dataPageBaseObject, currentPagePosition, keyLen); - currentPagePosition += keyLen; + Platform.copyMemory(keyBaseObj, keyOffset, dataPageBaseObject, dataPagePosition, keyLen); + dataPagePosition += keyLen; - PlatformDependent.copyMemory( - valueBaseObj, valueOffset, dataPageBaseObject, currentPagePosition, valueLen); - currentPagePosition += valueLen; + Platform.copyMemory(valueBaseObj, valueOffset, dataPageBaseObject, dataPagePosition, valueLen); - freeSpaceInCurrentPage -= totalSpaceRequired; + assert(inMemSorter != null); inMemSorter.insertRecord(recordAddress, prefix); } + /** + * Returns a sorted iterator. It is the caller's responsibility to call `cleanupResources()` + * after consuming this iterator. + */ public UnsafeSorterIterator getSortedIterator() throws IOException { - final UnsafeSorterIterator inMemoryIterator = inMemSorter.getSortedIterator(); + assert(inMemSorter != null); + final UnsafeInMemorySorter.SortedIterator inMemoryIterator = inMemSorter.getSortedIterator(); int numIteratorsToMerge = spillWriters.size() + (inMemoryIterator.hasNext() ? 
1 : 0); if (spillWriters.isEmpty()) { return inMemoryIterator; @@ -413,12 +517,11 @@ public UnsafeSorterIterator getSortedIterator() throws IOException { final UnsafeSorterSpillMerger spillMerger = new UnsafeSorterSpillMerger(recordComparator, prefixComparator, numIteratorsToMerge); for (UnsafeSorterSpillWriter spillWriter : spillWriters) { - spillMerger.addSpill(spillWriter.getReader(blockManager)); + spillMerger.addSpillIfNotEmpty(spillWriter.getReader(blockManager)); } spillWriters.clear(); - if (inMemoryIterator.hasNext()) { - spillMerger.addSpill(inMemoryIterator); - } + spillMerger.addSpillIfNotEmpty(inMemoryIterator); + return spillMerger.getSortedIterator(); } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 313146539190..f7787e1019c2 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -19,7 +19,7 @@ import java.util.Comparator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.util.collection.Sorter; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -133,7 +133,7 @@ public void insertRecord(long recordPointer, long keyPrefix) { pointerArrayInsertPosition++; } - private static final class SortedIterator extends UnsafeSorterIterator { + public static final class SortedIterator extends UnsafeSorterIterator { private final TaskMemoryManager memoryManager; private final int sortBufferInsertPosition; @@ -144,7 +144,7 @@ private static final class SortedIterator extends UnsafeSorterIterator { private long keyPrefix; private int recordLength; - SortedIterator( + private SortedIterator( TaskMemoryManager memoryManager, int sortBufferInsertPosition, long[] sortBuffer) { @@ -164,7 +164,7 @@ public void loadNext() { final long recordPointer = sortBuffer[position]; baseObject = memoryManager.getPage(recordPointer); baseOffset = memoryManager.getOffsetInPage(recordPointer) + 4; // Skip over record length - recordLength = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset - 4); + recordLength = Platform.getInt(baseObject, baseOffset - 4); keyPrefix = sortBuffer[position + 1]; position += 2; } @@ -186,7 +186,7 @@ public void loadNext() { * Return an iterator over record pointers in sorted order. For efficiency, all calls to * {@code next()} will return the same mutable object. 
*/ - public UnsafeSorterIterator getSortedIterator() { + public SortedIterator getSortedIterator() { sorter.sort(pointerArray, 0, pointerArrayInsertPosition / 2, sortComparator); return new SortedIterator(memoryManager, pointerArrayInsertPosition, pointerArray); } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java index 8272c2a5be0d..3874a9f9cbdb 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java @@ -47,11 +47,19 @@ public int compare(UnsafeSorterIterator left, UnsafeSorterIterator right) { priorityQueue = new PriorityQueue(numSpills, comparator); } - public void addSpill(UnsafeSorterIterator spillReader) throws IOException { + /** + * Add an UnsafeSorterIterator to this merger + */ + public void addSpillIfNotEmpty(UnsafeSorterIterator spillReader) throws IOException { if (spillReader.hasNext()) { + // We only add the spillReader to the priorityQueue if it is not empty. We do this to + // make sure the hasNext method of UnsafeSorterIterator returned by getSortedIterator + // does not return wrong result because hasNext will returns true + // at least priorityQueue.size() times. If we allow n spillReaders in the + // priorityQueue, we will have n extra empty records in the result of the UnsafeSorterIterator. spillReader.loadNext(); + priorityQueue.add(spillReader); } - priorityQueue.add(spillReader); } public UnsafeSorterIterator getSortedIterator() throws IOException { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index ca1ccedc93c8..4989b05d63e2 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -23,7 +23,7 @@ import org.apache.spark.storage.BlockId; import org.apache.spark.storage.BlockManager; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Reads spill files written by {@link UnsafeSorterSpillWriter} (see that class for a description @@ -42,7 +42,7 @@ final class UnsafeSorterSpillReader extends UnsafeSorterIterator { private byte[] arr = new byte[1024 * 1024]; private Object baseObject = arr; - private final long baseOffset = PlatformDependent.BYTE_ARRAY_OFFSET; + private final long baseOffset = Platform.BYTE_ARRAY_OFFSET; public UnsafeSorterSpillReader( BlockManager blockManager, diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java index 44cf6c756d7c..e59a84ff8d11 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java @@ -28,7 +28,7 @@ import org.apache.spark.storage.BlockManager; import org.apache.spark.storage.DiskBlockObjectWriter; import org.apache.spark.storage.TempLocalBlockId; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Spills a list of sorted records to disk. 
Spill files have the following format: @@ -117,11 +117,11 @@ public void write( long recordReadPosition = baseOffset; while (dataRemaining > 0) { final int toTransfer = Math.min(freeSpaceInWriteBuffer, dataRemaining); - PlatformDependent.copyMemory( + Platform.copyMemory( baseObject, recordReadPosition, writeBuffer, - PlatformDependent.BYTE_ARRAY_OFFSET + (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer), + Platform.BYTE_ARRAY_OFFSET + (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer), toTransfer); writer.write(writeBuffer, 0, (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer) + toTransfer); recordReadPosition += toTransfer; diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 4a893bc0189a..83dbea40b63f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -109,13 +109,13 @@ function toggleDagViz(forJob) { } $(function (){ - if (window.localStorage.getItem(expandDagVizArrowKey(false)) == "true") { + if ($("#stage-dag-viz").length && + window.localStorage.getItem(expandDagVizArrowKey(false)) == "true") { // Set it to false so that the click function can revert it window.localStorage.setItem(expandDagVizArrowKey(false), "false"); toggleDagViz(false); - } - - if (window.localStorage.getItem(expandDagVizArrowKey(true)) == "true") { + } else if ($("#job-dag-viz").length && + window.localStorage.getItem(expandDagVizArrowKey(true)) == "true") { // Set it to false so that the click function can revert it window.localStorage.setItem(expandDagVizArrowKey(true), "false"); toggleDagViz(true); diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index b1cef4704224..04f3070d25b4 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -207,7 +207,7 @@ span.additional-metric-title { /* Hide all additional metrics by default. This is done here rather than using JavaScript to * avoid slow page loads for stage pages with large numbers (e.g., thousands) of tasks. */ .scheduler_delay, .deserialization_time, .fetch_wait_time, .shuffle_read_remote, -.serialization_time, .getting_result_time { +.serialization_time, .getting_result_time, .peak_execution_memory { display: none; } @@ -224,3 +224,11 @@ span.additional-metric-title { a.expandbutton { cursor: pointer; } + +.executor-thread { + background: #E6E6E6; +} + +.non-executor-thread { + background: #FAFAFA; +} \ No newline at end of file diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index eb75f26718e1..c39c8667d013 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -152,8 +152,15 @@ class Accumulable[R, T] private[spark] ( in.defaultReadObject() value_ = zero deserialized = true + // Automatically register the accumulator when it is deserialized with the task closure. + // + // Note internal accumulators sent with task are deserialized before the TaskContext is created + // and are registered in the TaskContext constructor. Other internal accumulators, such SQL + // metrics, still need to register here. 
val taskContext = TaskContext.get() - taskContext.registerAccumulator(this) + if (taskContext != null) { + taskContext.registerAccumulator(this) + } } override def toString: String = if (value_ == null) "null" else value_.toString @@ -248,10 +255,20 @@ GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Serializa * @param param helper object defining how to add elements of type `T` * @tparam T result type */ -class Accumulator[T](@transient initialValue: T, param: AccumulatorParam[T], name: Option[String]) - extends Accumulable[T, T](initialValue, param, name) { +class Accumulator[T] private[spark] ( + @transient private[spark] val initialValue: T, + param: AccumulatorParam[T], + name: Option[String], + internal: Boolean) + extends Accumulable[T, T](initialValue, param, name, internal) { + + def this(initialValue: T, param: AccumulatorParam[T], name: Option[String]) = { + this(initialValue, param, name, false) + } - def this(initialValue: T, param: AccumulatorParam[T]) = this(initialValue, param, None) + def this(initialValue: T, param: AccumulatorParam[T]) = { + this(initialValue, param, None, false) + } } /** @@ -342,3 +359,41 @@ private[spark] object Accumulators extends Logging { } } + +private[spark] object InternalAccumulator { + val PEAK_EXECUTION_MEMORY = "peakExecutionMemory" + val TEST_ACCUMULATOR = "testAccumulator" + + // For testing only. + // This needs to be a def since we don't want to reuse the same accumulator across stages. + private def maybeTestAccumulator: Option[Accumulator[Long]] = { + if (sys.props.contains("spark.testing")) { + Some(new Accumulator( + 0L, AccumulatorParam.LongAccumulatorParam, Some(TEST_ACCUMULATOR), internal = true)) + } else { + None + } + } + + /** + * Accumulators for tracking internal metrics. + * + * These accumulators are created with the stage such that all tasks in the stage will + * add to the same set of accumulators. We do this to report the distribution of accumulator + * values across all tasks within each stage. + */ + def create(sc: SparkContext): Seq[Accumulator[Long]] = { + val internalAccumulators = Seq( + // Execution memory refers to the memory used by internal data structures created + // during shuffles, aggregations and joins. The value of this accumulator should be + // approximately the sum of the peak sizes across all such data structures created + // in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort. 
+ new Accumulator( + 0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true) + ) ++ maybeTestAccumulator.toSeq + internalAccumulators.foreach { accumulator => + sc.cleaner.foreach(_.registerAccumulatorForCleanup(accumulator)) + } + internalAccumulators + } +} diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala index ceeb58075d34..289aab9bd9e5 100644 --- a/core/src/main/scala/org/apache/spark/Aggregator.scala +++ b/core/src/main/scala/org/apache/spark/Aggregator.scala @@ -58,12 +58,7 @@ case class Aggregator[K, V, C] ( } else { val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) combiners.insertAll(iter) - // Update task metrics if context is not null - // TODO: Make context non optional in a future release - Option(context).foreach { c => - c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) - c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) - } + updateMetrics(context, combiners) combiners.iterator } } @@ -89,13 +84,18 @@ case class Aggregator[K, V, C] ( } else { val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) combiners.insertAll(iter) - // Update task metrics if context is not null - // TODO: Make context non-optional in a future release - Option(context).foreach { c => - c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) - c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) - } + updateMetrics(context, combiners) combiners.iterator } } + + /** Update task metrics after populating the external map. */ + private def updateMetrics(context: TaskContext, map: ExternalAppendOnlyMap[_, _, _]): Unit = { + Option(context).foreach { c => + c.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) + c.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) + c.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) + } + } } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1877aaf2cac5..b93536e6536e 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -599,14 +599,8 @@ private[spark] class ExecutorAllocationManager( // If this is the last pending task, mark the scheduler queue as empty stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex - val numTasksScheduled = stageIdToTaskIndices(stageId).size - val numTasksTotal = stageIdToNumTasks.getOrElse(stageId, -1) - if (numTasksScheduled == numTasksTotal) { - // No more pending tasks for this stage - stageIdToNumTasks -= stageId - if (stageIdToNumTasks.isEmpty) { - allocationManager.onSchedulerQueueEmpty() - } + if (totalPendingTasks() == 0) { + allocationManager.onSchedulerQueueEmpty() } // Mark the executor on which this task is scheduled as busy @@ -618,6 +612,8 @@ private[spark] class ExecutorAllocationManager( override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val executorId = taskEnd.taskInfo.executorId val taskId = taskEnd.taskInfo.taskId + val taskIndex = taskEnd.taskInfo.index + val stageId = taskEnd.stageId allocationManager.synchronized { numRunningTasks -= 1 // If the executor is no longer running any scheduled tasks, mark it as idle @@ -628,6 +624,16 @@ private[spark] class 
ExecutorAllocationManager( allocationManager.onExecutorIdle(executorId) } } + + // If the task failed, we expect it to be resubmitted later. To ensure we have + // enough resources to run the resubmitted task, we need to mark the scheduler + // as backlogged again if it's not already marked as such (SPARK-8366) + if (taskEnd.reason != Success) { + if (totalPendingTasks() == 0) { + allocationManager.onSchedulerBacklogged() + } + stageIdToTaskIndices.get(stageId).foreach { _.remove(taskIndex) } + } } } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 08bab4bf2739..b344b5e173d6 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -249,6 +249,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { Utils.byteStringAsBytes(get(key, defaultValue)) } + /** + * Get a size parameter as bytes, falling back to a default if not set. + */ + def getSizeAsBytes(key: String, defaultValue: Long): Long = { + Utils.byteStringAsBytes(get(key, defaultValue + "B")) + } + /** * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Kibibytes are assumed. @@ -382,6 +389,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { val driverOptsKey = "spark.driver.extraJavaOptions" val driverClassPathKey = "spark.driver.extraClassPath" val driverLibraryPathKey = "spark.driver.extraLibraryPath" + val sparkExecutorInstances = "spark.executor.instances" // Used by Yarn in 1.1 and before sys.props.get("spark.driver.libraryPath").foreach { value => @@ -469,6 +477,24 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + + if (!contains(sparkExecutorInstances)) { + sys.env.get("SPARK_WORKER_INSTANCES").foreach { value => + val warning = + s""" + |SPARK_WORKER_INSTANCES was detected (set to '$value'). + |This is deprecated in Spark 1.0+. + | + |Please instead use: + | - ./spark-submit with --num-executors to specify the number of executors + | - Or set SPARK_EXECUTOR_INSTANCES + | - spark.executor.instances to configure the number of instances in the spark config. + """.stripMargin + logWarning(warning) + + set("spark.executor.instances", value) + } + } } /** diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 4380cf45cc1b..bfeec7c6462b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -114,13 +114,16 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. * + * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] * from a list of input files or InputFormats for the application. 
*/ + @deprecated("Passing in preferred locations has no effect at all, see SPARK-8949", "1.5.0") @DeveloperApi def this(config: SparkConf, preferredNodeLocationData: Map[String, Set[SplitInfo]]) = { this(config) + logWarning("Passing in preferred locations has no effect at all, see SPARK-8949") this.preferredNodeLocationData = preferredNodeLocationData } @@ -143,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. + * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. + * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] + * from a list of input files or InputFormats for the application. */ def this( master: String, @@ -153,6 +159,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = { this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment)) + if (preferredNodeLocationData.nonEmpty) { + logWarning("Passing in preferred locations has no effect at all, see SPARK-8949") + } this.preferredNodeLocationData = preferredNodeLocationData } @@ -528,7 +537,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } // Optionally scale number of executors dynamically based on workload. Exposed for testing. - val dynamicAllocationEnabled = _conf.getBoolean("spark.dynamicAllocation.enabled", false) + val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(_conf) + if (!dynamicAllocationEnabled && _conf.getBoolean("spark.dynamicAllocation.enabled", false)) { + logInfo("Dynamic Allocation and num executors both set, thus dynamic allocation disabled.") + } + _executorAllocationManager = if (dynamicAllocationEnabled) { Some(new ExecutorAllocationManager(this, listenerBus, _conf)) @@ -559,7 +572,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Make sure the context is stopped if the user forgets about it. This avoids leaving // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM // is killed, though. - _shutdownHookRef = Utils.addShutdownHook(Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () => + _shutdownHookRef = ShutdownHookManager.addShutdownHook( + ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () => logInfo("Invoking stop() from shutdown hook") stop() } @@ -629,7 +643,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * [[org.apache.spark.SparkContext.setLocalProperty]]. */ def getLocalProperty(key: String): String = - Option(localProperties.get).map(_.getProperty(key)).getOrElse(null) + Option(localProperties.get).map(_.getProperty(key)).orNull /** Set a human readable description of the current job. */ def setJobDescription(value: String) { @@ -831,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. 
* @param minPartitions A suggestion value of the minimal splitting number for input data. */ def wholeTextFiles( @@ -866,7 +883,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * }}} * * Do - * `val rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * `val rdd = sparkContext.binaryFiles("hdfs://a-hdfs-path")`, * * then `rdd` contains * {{{ @@ -879,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ @Experimental @@ -908,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * '''Note:''' We ensure that the byte array for each record in the resulting RDD * has the provided record length. * - * @param path Directory to the input data files + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param recordLength The length at which to split the records + * @param conf Configuration for setting up the dataset. + * * @return An RDD of data with values, represented as byte arrays */ @Experimental @@ -1567,11 +1590,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * Register an RDD to be persisted in memory and/or disk storage */ private[spark] def persistRDD(rdd: RDD[_]) { - _executorAllocationManager.foreach { _ => - logWarning( - s"Dynamic allocation currently does not support cached RDDs. 
Cached data for RDD " + - s"${rdd.id} will be lost when executors are removed.") - } persistentRdds(rdd.id) = rdd } @@ -1667,7 +1685,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli return } if (_shutdownHookRef != null) { - Utils.removeShutdownHook(_shutdownHookRef) + ShutdownHookManager.removeShutdownHook(_shutdownHookRef) } Utils.tryLogNonFatalError { diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index adfece4d6e7c..0f1e2e069568 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -324,13 +324,15 @@ object SparkEnv extends Logging { val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase, shuffleMgrName) val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass) - val shuffleMemoryManager = new ShuffleMemoryManager(conf) + val shuffleMemoryManager = ShuffleMemoryManager.create(conf, numUsableCores) val blockTransferService = conf.get("spark.shuffle.blockTransferService", "netty").toLowerCase match { case "netty" => new NettyBlockTransferService(conf, securityManager, numUsableCores) case "nio" => + logWarning("NIO-based block transfer service is deprecated, " + + "and will be removed in Spark 1.6.0.") new NioBlockTransferService(conf, securityManager) } diff --git a/core/src/main/scala/org/apache/spark/SparkException.scala b/core/src/main/scala/org/apache/spark/SparkException.scala index 2ebd7a7151a5..977a27bdfe1b 100644 --- a/core/src/main/scala/org/apache/spark/SparkException.scala +++ b/core/src/main/scala/org/apache/spark/SparkException.scala @@ -30,3 +30,10 @@ class SparkException(message: String, cause: Throwable) */ private[spark] class SparkDriverExecutionException(cause: Throwable) extends SparkException("Execution error", cause) + +/** + * Exception thrown when the main user code is run as a child process (e.g. pyspark) and we want + * the parent SparkSubmit process to exit with the same exit code. + */ +private[spark] case class SparkUserAppException(exitCode: Int) + extends SparkException(s"User application exited with $exitCode") diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 5d2c551d5851..63cca80b2d73 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -61,12 +61,12 @@ object TaskContext { protected[spark] def unset(): Unit = taskContext.remove() /** - * Return an empty task context that is not actually used. - * Internal use only. + * An empty task context that does not represent an actual task. */ - private[spark] def empty(): TaskContext = { - new TaskContextImpl(0, 0, 0, 0, null, null) + private[spark] def empty(): TaskContextImpl = { + new TaskContextImpl(0, 0, 0, 0, null, null, Seq.empty) } + } @@ -187,4 +187,9 @@ abstract class TaskContext extends Serializable { * accumulator id and the value of the Map is the latest accumulator local value. */ private[spark] def collectAccumulators(): Map[Long, Any] + + /** + * Accumulators for tracking internal metrics indexed by the name. 
+ */ + private[spark] val internalMetricsToAccumulators: Map[String, Accumulator[Long]] } diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 9ee168ae016f..5df94c6d3a10 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -32,6 +32,7 @@ private[spark] class TaskContextImpl( override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, @transient private val metricsSystem: MetricsSystem, + internalAccumulators: Seq[Accumulator[Long]], val runningLocally: Boolean = false, val taskMetrics: TaskMetrics = TaskMetrics.empty) extends TaskContext @@ -114,4 +115,11 @@ private[spark] class TaskContextImpl( private[spark] override def collectAccumulators(): Map[Long, Any] = synchronized { accumulators.mapValues(_.localValue).toMap } + + private[spark] override val internalMetricsToAccumulators: Map[String, Accumulator[Long]] = { + // Explicitly register internal accumulators here because these are + // not captured in the task closure and are already deserialized + internalAccumulators.foreach(registerAccumulator) + internalAccumulators.map { a => (a.name.get, a) }.toMap + } } diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 48fd3e7e23d5..934d00dc708b 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -17,6 +17,8 @@ package org.apache.spark +import java.io.{IOException, ObjectInputStream, ObjectOutputStream} + import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId @@ -90,6 +92,10 @@ case class FetchFailed( * * `fullStackTrace` is a better representation of the stack trace because it contains the whole * stack trace including the exception and its causes + * + * `exception` is the actual exception that caused the task to fail. It may be `None` in + * the case that the exception is not in fact serializable. If a task fails more than + * once (due to retries), `exception` is that one that caused the last failure. */ @DeveloperApi case class ExceptionFailure( @@ -97,11 +103,26 @@ case class ExceptionFailure( description: String, stackTrace: Array[StackTraceElement], fullStackTrace: String, - metrics: Option[TaskMetrics]) + metrics: Option[TaskMetrics], + private val exceptionWrapper: Option[ThrowableSerializationWrapper]) extends TaskFailedReason { + /** + * `preserveCause` is used to keep the exception itself so it is available to the + * driver. This may be set to `false` in the event that the exception is not in fact + * serializable. 
+ */ + private[spark] def this(e: Throwable, metrics: Option[TaskMetrics], preserveCause: Boolean) { + this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics, + if (preserveCause) Some(new ThrowableSerializationWrapper(e)) else None) + } + private[spark] def this(e: Throwable, metrics: Option[TaskMetrics]) { - this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics) + this(e, metrics, preserveCause = true) + } + + def exception: Option[Throwable] = exceptionWrapper.flatMap { + (w: ThrowableSerializationWrapper) => Option(w.exception) } override def toErrorString: String = @@ -127,6 +148,25 @@ case class ExceptionFailure( } } +/** + * A class for recovering from exceptions when deserializing a Throwable that was + * thrown in user task code. If the Throwable cannot be deserialized it will be null, + * but the stacktrace and message will be preserved correctly in SparkException. + */ +private[spark] class ThrowableSerializationWrapper(var exception: Throwable) extends + Serializable with Logging { + private def writeObject(out: ObjectOutputStream): Unit = { + out.writeObject(exception) + } + private def readObject(in: ObjectInputStream): Unit = { + try { + exception = in.readObject().asInstanceOf[Throwable] + } catch { + case e : Exception => log.warn("Task exception could not be deserialized", e) + } + } +} + /** * :: DeveloperApi :: * The task finished successfully, but the result was lost from the executor's block manager before diff --git a/core/src/main/scala/org/apache/spark/annotation/Since.scala b/core/src/main/scala/org/apache/spark/annotation/Since.scala new file mode 100644 index 000000000000..af483e361e33 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/annotation/Since.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.annotation + +import scala.annotation.StaticAnnotation +import scala.annotation.meta._ + +/** + * A Scala annotation that specifies the Spark version when a definition was added. + * Different from the `@since` tag in JavaDoc, this annotation does not require explicit JavaDoc and + * hence works for overridden methods that inherit API documentation directly from parents. + * The limitation is that it does not show up in the generated Java API documentation. 
+ */ +@param @field @getter @setter @beanGetter @beanSetter +private[spark] class Since(version: String) extends StaticAnnotation diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 829fae1d1d9b..c582488f16fe 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -354,7 +354,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * Return an array that contains all of the elements in this RDD. * @deprecated As of Spark 1.0.0, toArray() is deprecated, use {@link #collect()} instead */ - @Deprecated + @deprecated("use collect()", "1.0.0") def toArray(): JList[T] = collect() /** diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 55e563ee968b..2a56bf28d702 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -794,7 +794,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: /** * We try to reuse a single Socket to transfer accumulator updates, as they are all added - * by the DAGScheduler's single-threaded actor anyway. + * by the DAGScheduler's single-threaded RpcEndpoint anyway. */ @transient var socket: Socket = _ diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 14dac4ed28ce..6ce02e2ea336 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -182,6 +182,7 @@ private[r] class RBackendHandler(server: RBackend) if (parameterType.isPrimitive) { parameterWrapperType = parameterType match { case java.lang.Integer.TYPE => classOf[java.lang.Integer] + case java.lang.Long.TYPE => classOf[java.lang.Integer] case java.lang.Double.TYPE => classOf[java.lang.Double] case java.lang.Boolean.TYPE => classOf[java.lang.Boolean] case _ => parameterType diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index d53abd3408c5..93b3bea57867 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -19,6 +19,8 @@ package org.apache.spark.api.r import java.io.File +import scala.collection.JavaConversions._ + import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { @@ -26,7 +28,7 @@ private[spark] object RUtils { * Get the SparkR package path in the local spark distribution. 
*/ def localSparkRPackagePath: Option[String] = { - val sparkHome = sys.env.get("SPARK_HOME") + val sparkHome = sys.env.get("SPARK_HOME").orElse(sys.props.get("spark.test.home")) sparkHome.map( Seq(_, "R", "lib").mkString(File.separator) ) @@ -46,8 +48,8 @@ private[spark] object RUtils { (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode")) } - val isYarnCluster = master.contains("yarn") && deployMode == "cluster" - val isYarnClient = master.contains("yarn") && deployMode == "client" + val isYarnCluster = master != null && master.contains("yarn") && deployMode == "cluster" + val isYarnClient = master != null && master.contains("yarn") && deployMode == "client" // In YARN mode, the SparkR package is distributed as an archive symbolically // linked to the "sparkr" file in the current directory. Note that this does not apply @@ -62,4 +64,10 @@ private[spark] object RUtils { } } } + + /** Check if R is installed before running tests that use R commands. */ + def isRInstalled: Boolean = { + val builder = new ProcessBuilder(Seq("R", "--version")) + builder.start().waitFor() == 0 + } } diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index d5b4260bf452..3c89f2447374 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -181,6 +181,7 @@ private[spark] object SerDe { // Boolean -> logical // Float -> double // Double -> double + // Decimal -> double // Long -> double // Array[Byte] -> raw // Date -> Date @@ -219,6 +220,10 @@ private[spark] object SerDe { case "float" | "java.lang.Float" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Float].toDouble) + case "decimal" | "java.math.BigDecimal" => + writeType(dos, "double") + val javaDecimal = value.asInstanceOf[java.math.BigDecimal] + writeDouble(dos, scala.math.BigDecimal(javaDecimal).toDouble) case "double" | "java.lang.Double" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Double]) diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala index 53356addf6ed..83ccaadfe744 100644 --- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala +++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala @@ -73,12 +73,8 @@ class LocalSparkCluster( def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected - // TODO: In Akka 2.1.x, ActorSystem.awaitTermination hangs when you have remote actors! - // This is unfortunate, but for now we just comment it out. 
workerRpcEnvs.foreach(_.shutdown()) - // workerActorSystems.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.shutdown()) - // masterActorSystems.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index c2ed43a5397d..23d01e9cbb9f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConversions._ import scala.util.Try +import org.apache.spark.SparkUserAppException import org.apache.spark.api.python.PythonUtils import org.apache.spark.util.{RedirectThread, Utils} @@ -46,7 +47,20 @@ object PythonRunner { // Launch a Py4J gateway server for the process to connect to; this will let it see our // Java system properties and such val gatewayServer = new py4j.GatewayServer(null, 0) - gatewayServer.start() + val thread = new Thread(new Runnable() { + override def run(): Unit = Utils.logUncaughtExceptions { + gatewayServer.start() + } + }) + thread.setName("py4j-gateway-init") + thread.setDaemon(true) + thread.start() + + // Wait until the gateway server has started, so that we know which port is it bound to. + // `gatewayServer.start()` will start a new thread and run the server code there, after + // initializing the socket, so the thread started above will end as soon as the server is + // ready to serve connections. + thread.join() // Build up a PYTHONPATH that includes the Spark assembly JAR (where this class is), the // python directories in SPARK_HOME (if set), and any files in the pyFiles argument @@ -64,11 +78,18 @@ object PythonRunner { env.put("PYTHONUNBUFFERED", "YES") // value is needed to be set to a non-empty string env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize - val process = builder.start() + try { + val process = builder.start() - new RedirectThread(process.getInputStream, System.out, "redirect output").start() + new RedirectThread(process.getInputStream, System.out, "redirect output").start() - System.exit(process.waitFor()) + val exitCode = process.waitFor() + if (exitCode != 0) { + throw new SparkUserAppException(exitCode) + } + } finally { + gatewayServer.shutdown() + } } /** diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala new file mode 100644 index 000000000000..ed1e97295567 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy + +import java.io._ +import java.util.jar.JarFile +import java.util.logging.Level +import java.util.zip.{ZipEntry, ZipOutputStream} + +import scala.collection.JavaConversions._ + +import com.google.common.io.{ByteStreams, Files} + +import org.apache.spark.{SparkException, Logging} +import org.apache.spark.api.r.RUtils +import org.apache.spark.util.{RedirectThread, Utils} + +private[deploy] object RPackageUtils extends Logging { + + /** The key in the MANIFEST.mf that we look for, in case a jar contains R code. */ + private final val hasRPackage = "Spark-HasRPackage" + + /** Base of the shell command used in order to install R packages. */ + private final val baseInstallCmd = Seq("R", "CMD", "INSTALL", "-l") + + /** R source code should exist under R/pkg in a jar. */ + private final val RJarEntries = "R/pkg" + + /** Documentation on how the R source file layout should be in the jar. */ + private[deploy] final val RJarDoc = + s"""In order for Spark to build R packages that are parts of Spark Packages, there are a few + |requirements. The R source code must be shipped in a jar, with additional Java/Scala + |classes. The jar must be in the following format: + | 1- The Manifest (META-INF/MANIFEST.mf) must contain the key-value: $hasRPackage: true + | 2- The standard R package layout must be preserved under R/pkg/ inside the jar. More + | information on the standard R package layout can be found in: + | http://cran.r-project.org/doc/contrib/Leisch-CreatingPackages.pdf + | An example layout is given below. After running `jar tf $$JAR_FILE | sort`: + | + |META-INF/MANIFEST.MF + |R/ + |R/pkg/ + |R/pkg/DESCRIPTION + |R/pkg/NAMESPACE + |R/pkg/R/ + |R/pkg/R/myRcode.R + |org/ + |org/apache/ + |... + """.stripMargin.trim + + /** Internal method for logging. We log to a printStream in tests, for debugging purposes. */ + private def print( + msg: String, + printStream: PrintStream, + level: Level = Level.FINE, + e: Throwable = null): Unit = { + if (printStream != null) { + // scalastyle:off println + printStream.println(msg) + // scalastyle:on println + if (e != null) { + e.printStackTrace(printStream) + } + } else { + level match { + case Level.INFO => logInfo(msg) + case Level.WARNING => logWarning(msg) + case Level.SEVERE => logError(msg, e) + case _ => logDebug(msg) + } + } + } + + /** + * Checks the manifest of the Jar whether there is any R source code bundled with it. + * Exposed for testing. + */ + private[deploy] def checkManifestForR(jar: JarFile): Boolean = { + val manifest = jar.getManifest.getMainAttributes + manifest.getValue(hasRPackage) != null && manifest.getValue(hasRPackage).trim == "true" + } + + /** + * Runs the standard R package installation code to build the R package from source. + * Multiple runs don't cause problems. + */ + private def rPackageBuilder(dir: File, printStream: PrintStream, verbose: Boolean): Boolean = { + // this code should be always running on the driver. + val pathToSparkR = RUtils.localSparkRPackagePath.getOrElse( + throw new SparkException("SPARK_HOME not set. 
Can't locate SparkR package.")) + val pathToPkg = Seq(dir, "R", "pkg").mkString(File.separator) + val installCmd = baseInstallCmd ++ Seq(pathToSparkR, pathToPkg) + if (verbose) { + print(s"Building R package with the command: $installCmd", printStream) + } + try { + val builder = new ProcessBuilder(installCmd) + builder.redirectErrorStream(true) + val env = builder.environment() + env.clear() + val process = builder.start() + new RedirectThread(process.getInputStream, printStream, "redirect R packaging").start() + process.waitFor() == 0 + } catch { + case e: Throwable => + print("Failed to build R package.", printStream, Level.SEVERE, e) + false + } + } + + /** + * Extracts the files under /R in the jar to a temporary directory for building. + */ + private def extractRFolder(jar: JarFile, printStream: PrintStream, verbose: Boolean): File = { + val tempDir = Utils.createTempDir(null) + val jarEntries = jar.entries() + while (jarEntries.hasMoreElements) { + val entry = jarEntries.nextElement() + val entryRIndex = entry.getName.indexOf(RJarEntries) + if (entryRIndex > -1) { + val entryPath = entry.getName.substring(entryRIndex) + if (entry.isDirectory) { + val dir = new File(tempDir, entryPath) + if (verbose) { + print(s"Creating directory: $dir", printStream) + } + dir.mkdirs + } else { + val inStream = jar.getInputStream(entry) + val outPath = new File(tempDir, entryPath) + Files.createParentDirs(outPath) + val outStream = new FileOutputStream(outPath) + if (verbose) { + print(s"Extracting $entry to $outPath", printStream) + } + Utils.copyStream(inStream, outStream, closeStreams = true) + } + } + } + tempDir + } + + /** + * Extracts the files under /R in the jar to a temporary directory for building. + */ + private[deploy] def checkAndBuildRPackage( + jars: String, + printStream: PrintStream = null, + verbose: Boolean = false): Unit = { + jars.split(",").foreach { jarPath => + val file = new File(Utils.resolveURI(jarPath)) + if (file.exists()) { + val jar = new JarFile(file) + if (checkManifestForR(jar)) { + print(s"$file contains R source code. Now installing package.", printStream, Level.INFO) + val rSource = extractRFolder(jar, printStream, verbose) + try { + if (!rPackageBuilder(rSource, printStream, verbose)) { + print(s"ERROR: Failed to build R package in $file.", printStream) + print(RJarDoc, printStream) + } + } finally { + rSource.delete() // clean up + } + } else { + if (verbose) { + print(s"$file doesn't contain R source code, skipping...", printStream) + } + } + } else { + print(s"WARN: $file resolved as dependency, but not found.", printStream, Level.WARNING) + } + } + } + + private def listFilesRecursively(dir: File, excludePatterns: Seq[String]): Set[File] = { + if (!dir.exists()) { + Set.empty[File] + } else { + if (dir.isDirectory) { + val subDir = dir.listFiles(new FilenameFilter { + override def accept(dir: File, name: String): Boolean = { + !excludePatterns.map(name.contains).reduce(_ || _) // exclude files with given pattern + } + }) + subDir.flatMap(listFilesRecursively(_, excludePatterns)).toSet + } else { + Set(dir) + } + } + } + + /** Zips all the libraries found with SparkR in the R/lib directory for distribution with Yarn. */ + private[deploy] def zipRLibraries(dir: File, name: String): File = { + val filesToBundle = listFilesRecursively(dir, Seq(".zip")) + // create a zip file from scratch, do not append to existing file. 
+ val zipFile = new File(dir, name) + zipFile.delete() + val zipOutputStream = new ZipOutputStream(new FileOutputStream(zipFile, false)) + try { + filesToBundle.foreach { file => + // get the relative paths for proper naming in the zip file + val relPath = file.getAbsolutePath.replaceFirst(dir.getAbsolutePath, "") + val fis = new FileInputStream(file) + val zipEntry = new ZipEntry(relPath) + zipOutputStream.putNextEntry(zipEntry) + ByteStreams.copy(fis, zipOutputStream) + zipOutputStream.closeEntry() + fis.close() + } + } finally { + zipOutputStream.close() + } + zipFile + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index e06b06e06fb4..dda4216c7efe 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -34,6 +34,8 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.JobContext +import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} +import org.apache.hadoop.mapreduce.{TaskAttemptID => MapReduceTaskAttemptID} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.annotation.DeveloperApi @@ -74,7 +76,7 @@ class SparkHadoopUtil extends Logging { } } - @Deprecated + @deprecated("use newConfiguration with SparkConf argument", "1.2.0") def newConfiguration(): Configuration = newConfiguration(null) /** @@ -194,6 +196,18 @@ class SparkHadoopUtil extends Logging { method.invoke(context).asInstanceOf[Configuration] } + /** + * Using reflection to call `getTaskAttemptID` from TaskAttemptContext. If we directly + * call `TaskAttemptContext.getTaskAttemptID`, it will generate different byte codes + * for Hadoop 1.+ and Hadoop 2.+ because TaskAttemptContext is class in Hadoop 1.+ + * while it's interface in Hadoop 2.+. + */ + def getTaskAttemptIDFromTaskAttemptContext( + context: MapReduceTaskAttemptContext): MapReduceTaskAttemptID = { + val method = context.getClass.getMethod("getTaskAttemptID") + method.invoke(context).asInstanceOf[MapReduceTaskAttemptID] + } + /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. 
If the * given path points to a file, return a single-element collection containing [[FileStatus]] of diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 0b39ee8fe3ba..86fcf942c2c4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -24,6 +24,7 @@ import java.security.PrivilegedExceptionAction import scala.collection.mutable.{ArrayBuffer, HashMap, Map} +import org.apache.commons.lang3.StringUtils import org.apache.hadoop.fs.Path import org.apache.hadoop.security.UserGroupInformation import org.apache.ivy.Ivy @@ -37,8 +38,9 @@ import org.apache.ivy.core.settings.IvySettings import org.apache.ivy.plugins.matcher.GlobPatternMatcher import org.apache.ivy.plugins.repository.file.FileRepository import org.apache.ivy.plugins.resolver.{FileSystemResolver, ChainResolver, IBiblioResolver} + +import org.apache.spark.{SparkUserAppException, SPARK_VERSION} import org.apache.spark.api.r.RUtils -import org.apache.spark.SPARK_VERSION import org.apache.spark.deploy.rest._ import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} @@ -275,24 +277,27 @@ object SparkSubmit { // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files // too for packages that include Python code - val resolvedMavenCoordinates = - SparkSubmitUtils.resolveMavenCoordinates( - args.packages, Option(args.repositories), Option(args.ivyRepoPath)) - if (!resolvedMavenCoordinates.trim.isEmpty) { - if (args.jars == null || args.jars.trim.isEmpty) { - args.jars = resolvedMavenCoordinates + val exclusions: Seq[String] = + if (!StringUtils.isBlank(args.packagesExclusions)) { + args.packagesExclusions.split(",") } else { - args.jars += s",$resolvedMavenCoordinates" + Nil } + val resolvedMavenCoordinates = SparkSubmitUtils.resolveMavenCoordinates(args.packages, + Option(args.repositories), Option(args.ivyRepoPath), exclusions = exclusions) + if (!StringUtils.isBlank(resolvedMavenCoordinates)) { + args.jars = mergeFileLists(args.jars, resolvedMavenCoordinates) if (args.isPython) { - if (args.pyFiles == null || args.pyFiles.trim.isEmpty) { - args.pyFiles = resolvedMavenCoordinates - } else { - args.pyFiles += s",$resolvedMavenCoordinates" - } + args.pyFiles = mergeFileLists(args.pyFiles, resolvedMavenCoordinates) } } + // install any R packages that may have been passed through --jars or --packages. + // Spark Packages may contain R source code inside the jar. 
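+    // For example (hypothetical package coordinates), a submission such as
+    //   spark-submit --packages com.example:my-r-extension_2.10:0.1.0 analysis.R
+    // resolves the package jar, and if its manifest declares Spark-HasRPackage: true,
+    // the bundled R/pkg sources are built and installed before the R script runs.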
+ if (args.isR && !StringUtils.isBlank(args.jars)) { + RPackageUtils.checkAndBuildRPackage(args.jars, printStream, args.verbose) + } + // Require all python files to be local, so we can add them to the PYTHONPATH // In YARN cluster mode, python files are distributed as regular files, which can be non-local if (args.isPython && !isYarnCluster) { @@ -362,7 +367,8 @@ object SparkSubmit { if (rPackagePath.isEmpty) { printErrorAndExit("SPARK_HOME does not exist for R application in YARN mode.") } - val rPackageFile = new File(rPackagePath.get, SPARKR_PACKAGE_ARCHIVE) + val rPackageFile = + RPackageUtils.zipRLibraries(new File(rPackagePath.get), SPARKR_PACKAGE_ARCHIVE) if (!rPackageFile.exists()) { printErrorAndExit(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.") } @@ -416,7 +422,8 @@ object SparkSubmit { // Yarn client only OptionAssigner(args.queue, YARN, CLIENT, sysProp = "spark.yarn.queue"), - OptionAssigner(args.numExecutors, YARN, CLIENT, sysProp = "spark.executor.instances"), + OptionAssigner(args.numExecutors, YARN, ALL_DEPLOY_MODES, + sysProp = "spark.executor.instances"), OptionAssigner(args.files, YARN, CLIENT, sysProp = "spark.yarn.dist.files"), OptionAssigner(args.archives, YARN, CLIENT, sysProp = "spark.yarn.dist.archives"), OptionAssigner(args.principal, YARN, CLIENT, sysProp = "spark.yarn.principal"), @@ -427,7 +434,6 @@ object SparkSubmit { OptionAssigner(args.driverMemory, YARN, CLUSTER, clOption = "--driver-memory"), OptionAssigner(args.driverCores, YARN, CLUSTER, clOption = "--driver-cores"), OptionAssigner(args.queue, YARN, CLUSTER, clOption = "--queue"), - OptionAssigner(args.numExecutors, YARN, CLUSTER, clOption = "--num-executors"), OptionAssigner(args.executorMemory, YARN, CLUSTER, clOption = "--executor-memory"), OptionAssigner(args.executorCores, YARN, CLUSTER, clOption = "--executor-cores"), OptionAssigner(args.files, YARN, CLUSTER, clOption = "--files"), @@ -666,7 +672,13 @@ object SparkSubmit { mainMethod.invoke(null, childArgs.toArray) } catch { case t: Throwable => - throw findCause(t) + findCause(t) match { + case SparkUserAppException(exitCode) => + System.exit(exitCode) + + case t: Throwable => + throw t + } } } @@ -736,7 +748,7 @@ object SparkSubmit { * no files, into a single comma-separated string. 
*/ private def mergeFileLists(lists: String*): String = { - val merged = lists.filter(_ != null) + val merged = lists.filterNot(StringUtils.isBlank) .flatMap(_.split(",")) .mkString(",") if (merged == "") null else merged @@ -938,7 +950,7 @@ private[spark] object SparkSubmitUtils { // are supplied to spark-submit val alternateIvyCache = ivyPath.getOrElse("") val packagesDirectory: File = - if (alternateIvyCache.trim.isEmpty) { + if (alternateIvyCache == null || alternateIvyCache.trim.isEmpty) { new File(ivySettings.getDefaultIvyUserDir, "jars") } else { ivySettings.setDefaultIvyUserDir(new File(alternateIvyCache)) @@ -988,11 +1000,9 @@ private[spark] object SparkSubmitUtils { addExclusionRules(ivySettings, ivyConfName, md) // add all supplied maven artifacts as dependencies addDependenciesToIvy(md, artifacts, ivyConfName) - exclusions.foreach { e => md.addExcludeRule(createExclusion(e + ":*", ivySettings, ivyConfName)) } - // resolve dependencies val rr: ResolveReport = ivy.resolve(md, resolveOptions) if (rr.hasError) { @@ -1010,7 +1020,7 @@ private[spark] object SparkSubmitUtils { } } - private def createExclusion( + private[deploy] def createExclusion( coords: String, ivySettings: IvySettings, ivyConfName: String): ExcludeRule = { diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index b3710073e330..3f3c6627c21f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -59,6 +59,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S var packages: String = null var repositories: String = null var ivyRepoPath: String = null + var packagesExclusions: String = null var verbose: Boolean = false var isPython: Boolean = false var pyFiles: String = null @@ -172,6 +173,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S name = Option(name).orElse(sparkProperties.get("spark.app.name")).orNull jars = Option(jars).orElse(sparkProperties.get("spark.jars")).orNull ivyRepoPath = sparkProperties.get("spark.jars.ivy").orNull + packages = Option(packages).orElse(sparkProperties.get("spark.jars.packages")).orNull + packagesExclusions = Option(packagesExclusions) + .orElse(sparkProperties.get("spark.jars.excludes")).orNull deployMode = Option(deployMode).orElse(env.get("DEPLOY_MODE")).orNull numExecutors = Option(numExecutors) .getOrElse(sparkProperties.get("spark.executor.instances").orNull) @@ -299,6 +303,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | childArgs [${childArgs.mkString(" ")}] | jars $jars | packages $packages + | packagesExclusions $packagesExclusions | repositories $repositories | verbose $verbose | @@ -391,6 +396,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S case PACKAGES => packages = value + case PACKAGES_EXCLUDE => + packagesExclusions = value + case REPOSITORIES => repositories = value @@ -482,6 +490,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | maven repo, then maven central and any additional remote | repositories given by --repositories. The format for the | coordinates should be groupId:artifactId:version. + | --exclude-packages Comma-separated list of groupId:artifactId, to exclude while + | resolving the dependencies provided in --packages to avoid + | dependency conflicts. 
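        |                              For example (hypothetical coordinates), combining
        |                              --packages com.example:app_2.10:1.0 with
        |                              --exclude-packages com.example:unwanted-dep keeps that
        |                              transitive dependency out of the resolved jars.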
| --repositories Comma-separated list of additional remote repositories to | search for the maven coordinates given with --packages. | --py-files PY_FILES Comma-separated list of .zip, .egg, or .py files to place @@ -600,5 +611,4 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S System.setErr(currentErr) } } - } diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 7576a2985ee7..25ea6925434a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -257,7 +257,7 @@ private[spark] class AppClient( } def start() { - // Just launch an actor; it will call back into the listener. + // Just launch an rpcEndpoint; it will call back into the listener. endpoint = rpcEnv.setupEndpoint("AppClient", new ClientEndpoint(rpcEnv)) } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e3060ac3fa1a..e573ff16c50a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -126,11 +126,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Disable the background thread during tests. if (!conf.contains("spark.testing")) { // A task that periodically checks for event log updates on disk. - pool.scheduleAtFixedRate(getRunner(checkForLogs), 0, UPDATE_INTERVAL_S, TimeUnit.SECONDS) + pool.scheduleWithFixedDelay(getRunner(checkForLogs), 0, UPDATE_INTERVAL_S, TimeUnit.SECONDS) if (conf.getBoolean("spark.history.fs.cleaner.enabled", false)) { // A task that periodically cleans event logs on disk. - pool.scheduleAtFixedRate(getRunner(cleanLogs), 0, CLEAN_INTERVAL_S, TimeUnit.SECONDS) + pool.scheduleWithFixedDelay(getRunner(cleanLogs), 0, CLEAN_INTERVAL_S, TimeUnit.SECONDS) } } } @@ -204,11 +204,25 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) mod1 >= mod2 } - logInfos.sliding(20, 20).foreach { batch => - replayExecutor.submit(new Runnable { - override def run(): Unit = mergeApplicationListing(batch) - }) - } + logInfos.grouped(20) + .map { batch => + replayExecutor.submit(new Runnable { + override def run(): Unit = mergeApplicationListing(batch) + }) + } + .foreach { task => + try { + // Wait for all tasks to finish. This makes sure that checkForLogs + // is not scheduled again while some tasks are already running in + // the replayExecutor. 
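+          // (scheduleWithFixedDelay, used above instead of scheduleAtFixedRate, measures the
+          // delay from the end of one run to the start of the next, so a slow merge round
+          // cannot trigger overlapping executions.)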
+ task.get() + } catch { + case e: InterruptedException => + throw e + case e: Exception => + logError("Exception while merging application listings", e) + } + } lastModifiedTime = newLastModifiedTime } catch { @@ -272,9 +286,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Replay the log files in the list and merge the list of old applications with new ones */ private def mergeApplicationListing(logs: Seq[FileStatus]): Unit = { - val bus = new ReplayListenerBus() val newAttempts = logs.flatMap { fileStatus => try { + val bus = new ReplayListenerBus() val res = replay(fileStatus, bus) res match { case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully.") diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index a076a9c3f984..d4f327cc588f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -30,7 +30,7 @@ import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, Applica UIRoot} import org.apache.spark.ui.{SparkUI, UIUtils, WebUI} import org.apache.spark.ui.JettyUtils._ -import org.apache.spark.util.{SignalLogger, Utils} +import org.apache.spark.util.{ShutdownHookManager, SignalLogger, Utils} /** * A web server that renders SparkUIs of completed applications. @@ -238,7 +238,7 @@ object HistoryServer extends Logging { val server = new HistoryServer(conf, provider, securityManager, port) server.bind() - Utils.addShutdownHook { () => server.stop() } + ShutdownHookManager.addShutdownHook { () => server.stop() } // Wait until the end of the world... or if the HistoryServer process is manually stopped while(true) { Thread.sleep(Int.MaxValue) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala b/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala index cf77c86d760c..70f21fbe0de8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala @@ -26,7 +26,7 @@ import org.apache.spark.annotation.DeveloperApi */ @DeveloperApi trait LeaderElectionAgent { - val masterActor: LeaderElectable + val masterInstance: LeaderElectable def stop() {} // to avoid noops in implementations. } @@ -37,7 +37,7 @@ trait LeaderElectable { } /** Single-node implementation of LeaderElectionAgent -- we're initially and always the leader. 
*/ -private[spark] class MonarchyLeaderAgent(val masterActor: LeaderElectable) +private[spark] class MonarchyLeaderAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { - masterActor.electedLeader() + masterInstance.electedLeader() } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index e38e437fe1c5..26904d39a9be 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -127,14 +127,8 @@ private[deploy] class Master( // Alternative application submission gateway that is stable across Spark versions private val restServerEnabled = conf.getBoolean("spark.master.rest.enabled", true) - private val restServer = - if (restServerEnabled) { - val port = conf.getInt("spark.master.rest.port", 6066) - Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl)) - } else { - None - } - private val restServerBoundPort = restServer.map(_.start()) + private var restServer: Option[StandaloneRestServer] = None + private var restServerBoundPort: Option[Int] = None override def onStart(): Unit = { logInfo("Starting Spark master at " + masterUrl) @@ -148,6 +142,12 @@ private[deploy] class Master( } }, 0, WORKER_TIMEOUT_MS, TimeUnit.MILLISECONDS) + if (restServerEnabled) { + val port = conf.getInt("spark.master.rest.port", 6066) + restServer = Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl)) + } + restServerBoundPort = restServer.map(_.start()) + masterMetricsSystem.registerSource(masterSource) masterMetricsSystem.start() applicationMetricsSystem.start() @@ -581,20 +581,22 @@ private[deploy] class Master( /** Return whether the specified worker can launch an executor for this app. */ def canLaunchExecutor(pos: Int): Boolean = { + val keepScheduling = coresToAssign >= minCoresPerExecutor + val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor + // If we allow multiple executors per worker, then we can always launch new executors. - // Otherwise, we may have already started assigning cores to the executor on this worker. + // Otherwise, if there is already an executor on this worker, just give it more cores. 
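+      // Worked example: with spark.executor.cores = 2 (two cores per executor, so
+      // minCoresPerExecutor = 2), a worker that still has at least 2 unassigned free cores
+      // can receive another allocation this round (subject to the memory and executor-limit
+      // checks below), while a worker left with a single free core is skipped even if
+      // coresToAssign is still large.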
val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0 - val underLimit = - if (launchingNewExecutor) { - assignedExecutors.sum + app.executors.size < app.executorLimit - } else { - true - } - val assignedMemory = assignedExecutors(pos) * memoryPerExecutor - usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor && - usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor && - coresToAssign >= minCoresPerExecutor && - underLimit + if (launchingNewExecutor) { + val assignedMemory = assignedExecutors(pos) * memoryPerExecutor + val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor + val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit + keepScheduling && enoughCores && enoughMemory && underLimit + } else { + // We're adding cores to an existing executor, so no need + // to check memory and executor limits + keepScheduling && enoughCores + } } // Keep launching executors until no more workers can accommodate any diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala index 68c937188b33..a952cee36eb4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala @@ -38,5 +38,5 @@ private[master] object MasterMessages { case object BoundPortsRequest - case class BoundPortsResponse(actorPort: Int, webUIPort: Int, restPort: Option[Int]) + case class BoundPortsResponse(rpcEndpointPort: Int, webUIPort: Int, restPort: Option[Int]) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala index 6fdff86f66e0..d317206a614f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala @@ -22,7 +22,7 @@ import org.apache.curator.framework.CuratorFramework import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch} import org.apache.spark.deploy.SparkCuratorUtil -private[master] class ZooKeeperLeaderElectionAgent(val masterActor: LeaderElectable, +private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable, conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election" @@ -73,10 +73,10 @@ private[master] class ZooKeeperLeaderElectionAgent(val masterActor: LeaderElecta private def updateLeadershipStatus(isLeader: Boolean) { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER - masterActor.electedLeader() + masterInstance.electedLeader() } else if (!isLeader && status == LeadershipStatus.LEADER) { status = LeadershipStatus.NOT_LEADER - masterActor.revokedLeadership() + masterInstance.revokedLeadership() } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 29a504228557..ab3fea475c2a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -28,7 +28,7 @@ import org.apache.spark.rpc.RpcEndpointRef import 
org.apache.spark.{SecurityManager, SparkConf, Logging} import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages.ExecutorStateChanged -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} import org.apache.spark.util.logging.FileAppender /** @@ -70,7 +70,8 @@ private[deploy] class ExecutorRunner( } workerThread.start() // Shutdown hook that kills actors on shutdown. - shutdownHook = Utils.addShutdownHook { () => killProcess(Some("Worker shutting down")) } + shutdownHook = ShutdownHookManager.addShutdownHook { () => + killProcess(Some("Worker shutting down")) } } /** @@ -102,7 +103,7 @@ private[deploy] class ExecutorRunner( workerThread = null state = ExecutorState.KILLED try { - Utils.removeShutdownHook(shutdownHook) + ShutdownHookManager.removeShutdownHook(shutdownHook) } catch { case e: IllegalStateException => None } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index c82a7ccab54d..79b1536d9401 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -228,7 +228,7 @@ private[deploy] class Worker( /** * Re-register with the master because a network failure or a master failure has occurred. * If the re-registration attempt threshold is exceeded, the worker exits with error. - * Note that for thread-safety this should only be called from the actor. + * Note that for thread-safety this should only be called from the rpcEndpoint. */ private def reregisterWithMaster(): Unit = { Utils.tryOrExit { @@ -365,7 +365,8 @@ private[deploy] class Worker( if (connected) { sendToMaster(Heartbeat(workerId, self)) } case WorkDirCleanup => - // Spin up a separate thread (in a future) to do the dir cleanup; don't tie up worker actor + // Spin up a separate thread (in a future) to do the dir cleanup; don't tie up worker + // rpcEndpoint. // Copy ids so that it can be used in the cleanup thread. val appIds = executors.values.map(_.appId).toSet val cleanupFuture = concurrent.future { @@ -427,7 +428,9 @@ private[deploy] class Worker( // application finishes. 
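+          // The executor directories are created with permissions 0700 below so that other
+          // local users on the node cannot read shuffle or spill files written by this app.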
val appLocalDirs = appDirectories.get(appId).getOrElse {
Utils.getOrCreateLocalRootDirs(conf).map { dir =>
- Utils.createDirectory(dir, namePrefix = "executor").getAbsolutePath()
+ val appDir = Utils.createDirectory(dir, namePrefix = "executor")
+ Utils.chmod700(appDir)
+ appDir.getAbsolutePath()
}.toSeq
}
appDirectories(appId) = appLocalDirs
@@ -684,7 +687,7 @@ private[deploy] object Worker extends Logging {
workerNumber: Option[Int] = None,
conf: SparkConf = new SparkConf): RpcEnv = {
- // The LocalSparkCluster runs multiple local sparkWorkerX actor systems
+ // The LocalSparkCluster runs multiple local sparkWorkerX RPC Environments
val systemName = SYSTEM_NAME + workerNumber.map(_.toString).getOrElse("")
val securityMgr = new SecurityManager(conf)
val rpcEnv = RpcEnv.create(systemName, host, port, conf, securityMgr)
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala
index fae5640b9a21..735c4f092715 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala
@@ -43,7 +43,7 @@ private[spark] class WorkerWatcher(override val rpcEnv: RpcEnv, workerUrl: Strin
private[deploy] def setTesting(testing: Boolean) = isTesting = testing
private var isTesting = false
- // Lets us filter events only from the worker's actor system
+ // Lets us filter events only from the worker's rpc system
private val expectedAddress = RpcAddress.fromURIString(workerUrl)
private def isWorker(address: RpcAddress) = expectedAddress == address
@@ -62,7 +62,7 @@ private[spark] class WorkerWatcher(override val rpcEnv: RpcEnv, workerUrl: Strin
override def onDisconnected(remoteAddress: RpcAddress): Unit = {
if (isWorker(remoteAddress)) {
// This log message will never be seen
- logError(s"Lost connection to worker actor $workerUrl. Exiting.")
+ logError(s"Lost connection to worker rpc endpoint $workerUrl. 
Exiting.") exitNonZero() } } diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 7bc7fce7ae8d..42a85e42ea2b 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -17,7 +17,7 @@ package org.apache.spark.executor -import java.io.File +import java.io.{File, NotSerializableException} import java.lang.management.ManagementFactory import java.net.URL import java.nio.ByteBuffer @@ -249,6 +249,7 @@ private[spark] class Executor( m.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime) m.setJvmGCTime(computeTotalGcTime() - startGCTime) m.setResultSerializationTime(afterSerialization - beforeSerialization) + m.updateAccumulators() } val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.orNull) @@ -300,11 +301,20 @@ private[spark] class Executor( task.metrics.map { m => m.setExecutorRunTime(System.currentTimeMillis() - taskStart) m.setJvmGCTime(computeTotalGcTime() - startGCTime) + m.updateAccumulators() m } } - val taskEndReason = new ExceptionFailure(t, metrics) - execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(taskEndReason)) + val serializedTaskEndReason = { + try { + ser.serialize(new ExceptionFailure(t, metrics)) + } catch { + case _: NotSerializableException => + // t is not serializable so just send the stacktrace + ser.serialize(new ExceptionFailure(t, metrics, false)) + } + } + execBackend.statusUpdate(taskId, TaskState.FAILED, serializedTaskEndReason) // Don't forcibly exit unless the exception was inherently fatal, to avoid // stopping other tasks unnecessarily. diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 607d5a321efc..9dc36704a676 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -148,7 +148,7 @@ class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec { try { Snappy.getNativeLibraryVersion } catch { - case e: Error => throw new IllegalArgumentException + case e: Error => throw new IllegalArgumentException(e) } override def compressedOutputStream(s: OutputStream): OutputStream = { diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index 87df42748be4..f405b732e472 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -24,6 +24,7 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.executor.CommitDeniedException import org.apache.spark.{Logging, SparkEnv, TaskContext} import org.apache.spark.util.{Utils => SparkUtils} @@ -93,7 +94,7 @@ object SparkHadoopMapRedUtil extends Logging { splitId: Int, attemptId: Int): Unit = { - val mrTaskAttemptID = mrTaskContext.getTaskAttemptID + val mrTaskAttemptID = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(mrTaskContext) // Called after we have decided to commit def performCommit(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/package.scala 
b/core/src/main/scala/org/apache/spark/package.scala index 8ae76c5f72f2..5e3b75bde690 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -42,6 +42,5 @@ package org.apache */ package object spark { - // For package docs only - val SPARK_VERSION = "1.5.0-SNAPSHOT" + val SPARK_VERSION = "1.5.0" } diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 130b58882d8e..9c617fc719cb 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -23,8 +23,7 @@ import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer -import org.apache.spark.{InterruptibleIterator, Partition, Partitioner, SparkEnv, TaskContext} -import org.apache.spark.{Dependency, OneToOneDependency, ShuffleDependency} +import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.collection.{ExternalAppendOnlyMap, AppendOnlyMap, CompactBuffer} import org.apache.spark.util.Utils @@ -169,8 +168,10 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part: for ((it, depNum) <- rddIterators) { map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum)))) } - context.taskMetrics.incMemoryBytesSpilled(map.memoryBytesSpilled) - context.taskMetrics.incDiskBytesSpilled(map.diskBytesSpilled) + context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) + context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) new InterruptibleIterator(context, map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) } diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index f1c17369cb48..e1f8719eead0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -44,7 +44,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.executor.DataReadMethod import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD -import org.apache.spark.util.{SerializableConfiguration, NextIterator, Utils} +import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, NextIterator, Utils} import org.apache.spark.scheduler.{HostTaskLocation, HDFSCacheTaskLocation} import org.apache.spark.storage.StorageLevel @@ -274,7 +274,7 @@ class HadoopRDD[K, V]( } } catch { case e: Exception => { - if (!Utils.inShutdown()) { + if (!ShutdownHookManager.inShutdown()) { logWarning("Exception in RecordReader.close()", e) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala index a838aac6e8d1..4312d3a41775 100644 --- a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala @@ -21,6 +21,9 @@ import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} +/** + * An RDD that applies the provided function to every partition of the parent RDD. 
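+ * For example, `RDD.map`, `RDD.flatMap`, `RDD.filter` and `RDD.mapPartitions` are all
+ * implemented by wrapping the parent RDD in a MapPartitionsRDD with a suitable
+ * iterator-to-iterator function.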
+ */ private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag]( prev: RDD[T], f: (TaskContext, Int, Iterator[T]) => Iterator[U], // (TaskContext, partition index, iterator) diff --git a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala new file mode 100644 index 000000000000..417ff5278db2 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.rdd + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +import org.apache.spark.{Partition, Partitioner, TaskContext} + +/** + * An RDD that applies a user provided function to every partition of the parent RDD, and + * additionally allows the user to prepare each partition before computing the parent partition. + */ +private[spark] class MapPartitionsWithPreparationRDD[U: ClassTag, T: ClassTag, M: ClassTag]( + prev: RDD[T], + preparePartition: () => M, + executePartition: (TaskContext, Int, M, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean = false) + extends RDD[U](prev) { + + override val partitioner: Option[Partitioner] = { + if (preservesPartitioning) firstParent[T].partitioner else None + } + + override def getPartitions: Array[Partition] = firstParent[T].partitions + + // In certain join operations, prepare can be called on the same partition multiple times. + // In this case, we need to ensure that each call to compute gets a separate prepare argument. + private[this] val preparedArguments: ArrayBuffer[M] = new ArrayBuffer[M] + + /** + * Prepare a partition for a single call to compute. + */ + def prepare(): Unit = { + preparedArguments += preparePartition() + } + + /** + * Prepare a partition before computing it from its parent. 
+ */ + override def compute(partition: Partition, context: TaskContext): Iterator[U] = { + val prepared = + if (preparedArguments.isEmpty) { + preparePartition() + } else { + preparedArguments.remove(0) + } + val parentIterator = firstParent[T].iterator(partition, context) + executePartition(context, partition.index, prepared, parentIterator) + } +} diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index f83a051f5da1..6a9c004d65cf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -33,7 +33,7 @@ import org.apache.spark._ import org.apache.spark.executor.DataReadMethod import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD -import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, Utils} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.storage.StorageLevel @@ -186,7 +186,7 @@ class NewHadoopRDD[K, V]( } } catch { case e: Exception => { - if (!Utils.inShutdown()) { + if (!ShutdownHookManager.inShutdown()) { logWarning("Exception in RecordReader.close()", e) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala index 35e44cb59c1b..fa3fecc80cb6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala @@ -26,16 +26,14 @@ import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.executor.DataReadMethod import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.{Partition => SparkPartition, _} -import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, Utils} private[spark] class SqlNewHadoopPartition( @@ -60,18 +58,16 @@ private[spark] class SqlNewHadoopPartition( * and the executor side to the shared Hadoop Configuration. * * Note: This is RDD is basically a cloned version of [[org.apache.spark.rdd.NewHadoopRDD]] with - * changes based on [[org.apache.spark.rdd.HadoopRDD]]. In future, this functionality will be - * folded into core. + * changes based on [[org.apache.spark.rdd.HadoopRDD]]. 
*/ -private[spark] class SqlNewHadoopRDD[K, V]( +private[spark] class SqlNewHadoopRDD[V: ClassTag]( @transient sc : SparkContext, broadcastedConf: Broadcast[SerializableConfiguration], @transient initDriverSideJobFuncOpt: Option[Job => Unit], initLocalJobFuncOpt: Option[Job => Unit], - inputFormatClass: Class[_ <: InputFormat[K, V]], - keyClass: Class[K], + inputFormatClass: Class[_ <: InputFormat[Void, V]], valueClass: Class[V]) - extends RDD[(K, V)](sc, Nil) + extends RDD[V](sc, Nil) with SparkHadoopMapReduceUtil with Logging { @@ -120,8 +116,8 @@ private[spark] class SqlNewHadoopRDD[K, V]( override def compute( theSplit: SparkPartition, - context: TaskContext): InterruptibleIterator[(K, V)] = { - val iter = new Iterator[(K, V)] { + context: TaskContext): Iterator[V] = { + val iter = new Iterator[V] { val split = theSplit.asInstanceOf[SqlNewHadoopPartition] logInfo("Input split: " + split.serializableHadoopSplit) val conf = getConf(isDriverSide = false) @@ -154,17 +150,20 @@ private[spark] class SqlNewHadoopRDD[K, V]( configurable.setConf(conf) case _ => } - private var reader = format.createRecordReader( + private[this] var reader = format.createRecordReader( split.serializableHadoopSplit.value, hadoopAttemptContext) reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext) // Register an on-task-completion callback to close the input stream. context.addTaskCompletionListener(context => close()) - var havePair = false - var finished = false - var recordsSinceMetricsUpdate = 0 + + private[this] var havePair = false + private[this] var finished = false override def hasNext: Boolean = { + if (context.isInterrupted) { + throw new TaskKilledException + } if (!finished && !havePair) { finished = !reader.nextKeyValue if (finished) { @@ -178,7 +177,7 @@ private[spark] class SqlNewHadoopRDD[K, V]( !finished } - override def next(): (K, V) = { + override def next(): V = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } @@ -186,7 +185,7 @@ private[spark] class SqlNewHadoopRDD[K, V]( if (!finished) { inputMetrics.incRecordsRead(1) } - (reader.getCurrentKey, reader.getCurrentValue) + reader.getCurrentValue } private def close() { @@ -212,23 +211,14 @@ private[spark] class SqlNewHadoopRDD[K, V]( } } } catch { - case e: Exception => { - if (!Utils.inShutdown()) { + case e: Exception => + if (!ShutdownHookManager.inShutdown()) { logWarning("Exception in RecordReader.close()", e) } - } } } } - new InterruptibleIterator(context, iter) - } - - /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */ - @DeveloperApi - def mapPartitionsWithInputSplit[U: ClassTag]( - f: (InputSplit, Iterator[(K, V)]) => Iterator[U], - preservesPartitioning: Boolean = false): RDD[U] = { - new NewHadoopMapPartitionsWithSplitRDD(this, f, preservesPartitioning) + iter } override def getPreferredLocations(hsplit: SparkPartition): Seq[String] = { diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala index 81f40ad33aa5..b3c64394abc7 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala @@ -73,6 +73,16 @@ private[spark] abstract class ZippedPartitionsBaseRDD[V: ClassTag]( super.clearDependencies() rdds = null } + + /** + * Call the prepare method of every parent that has one. + * This is needed for reserving execution memory in advance. 
+ */ + protected def tryPrepareParents(): Unit = { + rdds.collect { + case rdd: MapPartitionsWithPreparationRDD[_, _, _] => rdd.prepare() + } + } } private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag]( @@ -84,6 +94,7 @@ private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag] extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context)) } @@ -107,6 +118,7 @@ private[spark] class ZippedPartitionsRDD3 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), @@ -134,6 +146,7 @@ private[spark] class ZippedPartitionsRDD4 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3, rdd4), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala index 6ae47894598b..7409ac885999 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala @@ -100,7 +100,7 @@ private[spark] abstract class RpcEndpointRef(@transient conf: SparkConf) val future = ask[T](message, timeout) val result = timeout.awaitResult(future) if (result == null) { - throw new SparkException("Actor returned null") + throw new SparkException("RpcEndpoint returned null") } return result } catch { diff --git a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala index e0edd7d4ae96..11d123eec43c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala @@ -24,11 +24,12 @@ import org.apache.spark.annotation.DeveloperApi * Information about an [[org.apache.spark.Accumulable]] modified during a task or stage. 
*/ @DeveloperApi -class AccumulableInfo ( +class AccumulableInfo private[spark] ( val id: Long, val name: String, val update: Option[String], // represents a partial update within a task - val value: String) { + val value: String, + val internal: Boolean) { override def equals(other: Any): Boolean = other match { case acc: AccumulableInfo => @@ -40,10 +41,10 @@ class AccumulableInfo ( object AccumulableInfo { def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = { - new AccumulableInfo(id, name, update, value) + new AccumulableInfo(id, name, update, value, internal = false) } def apply(id: Long, name: String, value: String): AccumulableInfo = { - new AccumulableInfo(id, name, None, value) + new AccumulableInfo(id, name, None, value, internal = false) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index c4fa277c2125..de69911f4bfb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -111,7 +111,7 @@ class DAGScheduler( * * All accesses to this map should be guarded by synchronizing on it (see SPARK-4454). */ - private val cacheLocs = new HashMap[Int, Seq[Seq[TaskLocation]]] + private val cacheLocs = new HashMap[Int, IndexedSeq[Seq[TaskLocation]]] // For tracking failed nodes, we use the MapOutputTracker's epoch number, which is sent with // every task. When we detect a node failing, we note the current epoch number and failed @@ -138,7 +138,7 @@ class DAGScheduler( // Flag to control if reduce tasks are assigned preferred locations private val shuffleLocalityEnabled = - sc.getConf.getBoolean("spark.shuffle.reduceLocality.enabled", true) + sc.getConf.getBoolean("spark.shuffle.reduceLocality.enabled", false) // Number of map, reduce tasks above which we do not assign preferred locations // based on map output sizes. We limit the size of jobs for which assign preferred locations // as computing the top locations by size becomes expensive. @@ -152,17 +152,24 @@ class DAGScheduler( // may lead to more delay in scheduling if those locations are busy. private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 - // Called by TaskScheduler to report task's starting. + /** + * Called by the TaskSetManager to report task's starting. + */ def taskStarted(task: Task[_], taskInfo: TaskInfo) { eventProcessLoop.post(BeginEvent(task, taskInfo)) } - // Called to report that a task has completed and results are being fetched remotely. + /** + * Called by the TaskSetManager to report that a task has completed + * and results are being fetched remotely. + */ def taskGettingResult(taskInfo: TaskInfo) { eventProcessLoop.post(GettingResultEvent(taskInfo)) } - // Called by TaskScheduler to report task completions or failures. + /** + * Called by the TaskSetManager to report task completions or failures. + */ def taskEnded( task: Task[_], reason: TaskEndReason, @@ -188,29 +195,35 @@ class DAGScheduler( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat")) } - // Called by TaskScheduler when an executor fails. + /** + * Called by TaskScheduler implementation when an executor fails. + */ def executorLost(execId: String): Unit = { eventProcessLoop.post(ExecutorLost(execId)) } - // Called by TaskScheduler when a host is added + /** + * Called by TaskScheduler implementation when a host is added. 
+ */ def executorAdded(execId: String, host: String): Unit = { eventProcessLoop.post(ExecutorAdded(execId, host)) } - // Called by TaskScheduler to cancel an entire TaskSet due to either repeated failures or - // cancellation of the job itself. - def taskSetFailed(taskSet: TaskSet, reason: String): Unit = { - eventProcessLoop.post(TaskSetFailed(taskSet, reason)) + /** + * Called by the TaskSetManager to cancel an entire TaskSet due to either repeated failures or + * cancellation of the job itself. + */ + def taskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable]): Unit = { + eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception)) } private[scheduler] - def getCacheLocs(rdd: RDD[_]): Seq[Seq[TaskLocation]] = cacheLocs.synchronized { + def getCacheLocs(rdd: RDD[_]): IndexedSeq[Seq[TaskLocation]] = cacheLocs.synchronized { // Note: this doesn't use `getOrElse()` because this method is called O(num tasks) times if (!cacheLocs.contains(rdd.id)) { // Note: if the storage level is NONE, we don't need to get locations from block manager. - val locs: Seq[Seq[TaskLocation]] = if (rdd.getStorageLevel == StorageLevel.NONE) { - Seq.fill(rdd.partitions.size)(Nil) + val locs: IndexedSeq[Seq[TaskLocation]] = if (rdd.getStorageLevel == StorageLevel.NONE) { + IndexedSeq.fill(rdd.partitions.length)(Nil) } else { val blockIds = rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId] @@ -302,12 +315,12 @@ class DAGScheduler( shuffleDep: ShuffleDependency[_, _, _], firstJobId: Int): ShuffleMapStage = { val rdd = shuffleDep.rdd - val numTasks = rdd.partitions.size + val numTasks = rdd.partitions.length val stage = newShuffleMapStage(rdd, numTasks, shuffleDep, firstJobId, rdd.creationSite) if (mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) { val serLocs = mapOutputTracker.getSerializedMapOutputStatuses(shuffleDep.shuffleId) val locs = MapOutputTracker.deserializeMapStatuses(serLocs) - for (i <- 0 until locs.size) { + for (i <- 0 until locs.length) { stage.outputLocs(i) = Option(locs(i)).toList // locs(i) will be null if missing } stage.numAvailableOutputs = locs.count(_ != null) @@ -315,7 +328,7 @@ class DAGScheduler( // Kind of ugly: need to register RDDs with the cache and map output tracker here // since we can't do it in the RDD constructor because # of partitions is unknown logInfo("Registering RDD " + rdd.id + " (" + rdd.getCreationSite + ")") - mapOutputTracker.registerShuffle(shuffleDep.shuffleId, rdd.partitions.size) + mapOutputTracker.registerShuffle(shuffleDep.shuffleId, rdd.partitions.length) } stage } @@ -566,7 +579,7 @@ class DAGScheduler( properties: Properties): PartialResult[R] = { val listener = new ApproximateActionListener(rdd, func, evaluator, timeout) val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _] - val partitions = (0 until rdd.partitions.size).toArray + val partitions = (0 until rdd.partitions.length).toArray val jobId = nextJobId.getAndIncrement() eventProcessLoop.post(JobSubmitted( jobId, rdd, func2, partitions, callSite, listener, SerializationUtils.clone(properties))) @@ -677,8 +690,11 @@ class DAGScheduler( submitWaitingStages() } - private[scheduler] def handleTaskSetFailed(taskSet: TaskSet, reason: String) { - stageIdToStage.get(taskSet.stageId).foreach {abortStage(_, reason) } + private[scheduler] def handleTaskSetFailed( + taskSet: TaskSet, + reason: String, + exception: Option[Throwable]): Unit = { + stageIdToStage.get(taskSet.stageId).foreach { abortStage(_, reason, exception) } submitWaitingStages() 
} @@ -715,7 +731,7 @@ class DAGScheduler( try { // New stage creation may throw an exception if, for example, jobs are run on a // HadoopRDD whose underlying HDFS files have been deleted. - finalStage = newResultStage(finalRDD, partitions.size, jobId, callSite) + finalStage = newResultStage(finalRDD, partitions.length, jobId, callSite) } catch { case e: Exception => logWarning("Creating new stage failed due to exception - job: " + jobId, e) @@ -762,7 +778,7 @@ class DAGScheduler( } } } else { - abortStage(stage, "No active job for stage " + stage.id) + abortStage(stage, "No active job for stage " + stage.id, None) } } @@ -773,16 +789,27 @@ class DAGScheduler( stage.pendingTasks.clear() // First figure out the indexes of partition ids to compute. - val partitionsToCompute: Seq[Int] = { + val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = { stage match { case stage: ShuffleMapStage => - (0 until stage.numPartitions).filter(id => stage.outputLocs(id).isEmpty) + val allPartitions = 0 until stage.numPartitions + val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty } + (allPartitions, filteredPartitions) case stage: ResultStage => val job = stage.resultOfJob.get - (0 until job.numPartitions).filter(id => !job.finished(id)) + val allPartitions = 0 until job.numPartitions + val filteredPartitions = allPartitions.filter { id => !job.finished(id) } + (allPartitions, filteredPartitions) } } + // Create internal accumulators if the stage has no accumulators initialized. + // Reset internal accumulators only if this stage is not partially submitted + // Otherwise, we may override existing accumulator values from some tasks + if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) { + stage.resetInternalAccumulators() + } + val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull runningStages += stage @@ -806,7 +833,7 @@ class DAGScheduler( case NonFatal(e) => stage.makeNewStageAttempt(partitionsToCompute.size) listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties)) - abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}") + abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e)) runningStages -= stage return } @@ -835,13 +862,13 @@ class DAGScheduler( } catch { // In the case of a failure during serialization, abort the stage. 
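        // (A typical trigger is a task closure that captures a non-serializable object, e.g. an
        // open connection; retrying cannot succeed, so the stage is aborted with the exception
        // attached for the driver-side error message.)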
case e: NotSerializableException => - abortStage(stage, "Task not serializable: " + e.toString) + abortStage(stage, "Task not serializable: " + e.toString, Some(e)) runningStages -= stage // Abort execution return case NonFatal(e) => - abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}") + abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e)) runningStages -= stage return } @@ -852,7 +879,8 @@ class DAGScheduler( partitionsToCompute.map { id => val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) - new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs) + new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, + taskBinary, part, locs, stage.internalAccumulators) } case stage: ResultStage => @@ -861,12 +889,13 @@ class DAGScheduler( val p: Int = job.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) - new ResultTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs, id) + new ResultTask(stage.id, stage.latestInfo.attemptId, + taskBinary, part, locs, id, stage.internalAccumulators) } } } catch { case NonFatal(e) => - abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}") + abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e)) runningStages -= stage return } @@ -916,9 +945,11 @@ class DAGScheduler( // To avoid UI cruft, ignore cases where value wasn't updated if (acc.name.isDefined && partialValue != acc.zero) { val name = acc.name.get - stage.latestInfo.accumulables(id) = AccumulableInfo(id, name, s"${acc.value}") + val value = s"${acc.value}" + stage.latestInfo.accumulables(id) = + new AccumulableInfo(id, name, None, value, acc.isInternal) event.taskInfo.accumulables += - AccumulableInfo(id, name, Some(s"$partialValue"), s"${acc.value}") + new AccumulableInfo(id, name, Some(s"$partialValue"), value, acc.isInternal) } } } catch { @@ -1021,7 +1052,7 @@ class DAGScheduler( // we registered these map outputs. mapOutputTracker.registerMapOutputs( shuffleStage.shuffleDep.shuffleId, - shuffleStage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray, + shuffleStage.outputLocs.map(list => if (list.isEmpty) null else list.head), changeEpoch = true) clearCacheLocs() @@ -1084,7 +1115,8 @@ class DAGScheduler( } if (disallowStageRetryForTest) { - abortStage(failedStage, "Fetch failure will not retry stage due to testing config") + abortStage(failedStage, "Fetch failure will not retry stage due to testing config", + None) } else if (failedStages.isEmpty) { // Don't schedule an event to resubmit failed stages if failed isn't empty, because // in that case the event will already have been scheduled. 
@@ -1112,7 +1144,7 @@ class DAGScheduler( case commitDenied: TaskCommitDenied => // Do nothing here, left up to the TaskScheduler to decide how to handle denied commits - case ExceptionFailure(className, description, stackTrace, fullStackTrace, metrics) => + case exceptionFailure: ExceptionFailure => // Do nothing here, left up to the TaskScheduler to decide how to handle user failures case TaskResultLost => @@ -1150,7 +1182,7 @@ class DAGScheduler( // TODO: This will be really slow if we keep accumulating shuffle map stages for ((shuffleId, stage) <- shuffleToMapStage) { stage.removeOutputsOnExecutor(execId) - val locs = stage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray + val locs = stage.outputLocs.map(list => if (list.isEmpty) null else list.head) mapOutputTracker.registerMapOutputs(shuffleId, locs, changeEpoch = true) } if (shuffleToMapStage.isEmpty) { @@ -1221,7 +1253,10 @@ class DAGScheduler( * Aborts all jobs depending on a particular Stage. This is called in response to a task set * being canceled by the TaskScheduler. Use taskSetFailed() to inject this event from outside. */ - private[scheduler] def abortStage(failedStage: Stage, reason: String) { + private[scheduler] def abortStage( + failedStage: Stage, + reason: String, + exception: Option[Throwable]): Unit = { if (!stageIdToStage.contains(failedStage.id)) { // Skip all the actions if the stage has been removed. return @@ -1230,7 +1265,7 @@ class DAGScheduler( activeJobs.filter(job => stageDependsOn(job.finalStage, failedStage)).toSeq failedStage.latestInfo.completionTime = Some(clock.getTimeMillis()) for (job <- dependentJobs) { - failJobAndIndependentStages(job, s"Job aborted due to stage failure: $reason") + failJobAndIndependentStages(job, s"Job aborted due to stage failure: $reason", exception) } if (dependentJobs.isEmpty) { logInfo("Ignoring failure of " + failedStage + " because all jobs depending on it are done") @@ -1238,8 +1273,11 @@ class DAGScheduler( } /** Fails a job and all stages that are only used by that job, and cleans up relevant state. */ - private def failJobAndIndependentStages(job: ActiveJob, failureReason: String) { - val error = new SparkException(failureReason) + private def failJobAndIndependentStages( + job: ActiveJob, + failureReason: String, + exception: Option[Throwable] = None): Unit = { + val error = new SparkException(failureReason, exception.getOrElse(null)) var ableToCancelStages = true val shouldInterruptThread = @@ -1358,33 +1396,36 @@ class DAGScheduler( return rddPrefs.map(TaskLocation(_)) } + // If the RDD has narrow dependencies, pick the first partition of the first narrow dependency + // that has any placement preferences. Ideally we would choose based on transfer sizes, + // but this will do for now. rdd.dependencies.foreach { case n: NarrowDependency[_] => - // If the RDD has narrow dependencies, pick the first partition of the first narrow dep - // that has any placement preferences. Ideally we would choose based on transfer sizes, - // but this will do for now. 
for (inPart <- n.getParents(partition)) { val locs = getPreferredLocsInternal(n.rdd, inPart, visited) if (locs != Nil) { return locs } } - case s: ShuffleDependency[_, _, _] => - // For shuffle dependencies, pick locations which have at least REDUCER_PREF_LOCS_FRACTION - // of data as preferred locations - if (shuffleLocalityEnabled && - rdd.partitions.size < SHUFFLE_PREF_REDUCE_THRESHOLD && - s.rdd.partitions.size < SHUFFLE_PREF_MAP_THRESHOLD) { - // Get the preferred map output locations for this reducer - val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId, - partition, rdd.partitions.size, REDUCER_PREF_LOCS_FRACTION) - if (topLocsForReducer.nonEmpty) { - return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId)) - } - } - case _ => } + + // If the RDD has shuffle dependencies and shuffle locality is enabled, pick locations that + // have at least REDUCER_PREF_LOCS_FRACTION of data as preferred locations + if (shuffleLocalityEnabled && rdd.partitions.length < SHUFFLE_PREF_REDUCE_THRESHOLD) { + rdd.dependencies.foreach { + case s: ShuffleDependency[_, _, _] => + if (s.rdd.partitions.length < SHUFFLE_PREF_MAP_THRESHOLD) { + // Get the preferred map output locations for this reducer + val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId, + partition, rdd.partitions.length, REDUCER_PREF_LOCS_FRACTION) + if (topLocsForReducer.nonEmpty) { + return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId)) + } + } + case _ => + } + } Nil } @@ -1448,8 +1489,8 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler case completion @ CompletionEvent(task, reason, _, _, taskInfo, taskMetrics) => dagScheduler.handleTaskCompletion(completion) - case TaskSetFailed(taskSet, reason) => - dagScheduler.handleTaskSetFailed(taskSet, reason) + case TaskSetFailed(taskSet, reason, exception) => + dagScheduler.handleTaskSetFailed(taskSet, reason, exception) case ResubmitFailedStages => dagScheduler.resubmitFailedStages() diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala index a213d419cf03..f72a52e85dc1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala @@ -73,6 +73,7 @@ private[scheduler] case class ExecutorAdded(execId: String, host: String) extend private[scheduler] case class ExecutorLost(execId: String) extends DAGSchedulerEvent private[scheduler] -case class TaskSetFailed(taskSet: TaskSet, reason: String) extends DAGSchedulerEvent +case class TaskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable]) + extends DAGSchedulerEvent private[scheduler] case object ResubmitFailedStages extends DAGSchedulerEvent diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index 8321037cdc02..5d926377ce86 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -162,7 +162,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) private[spark] object OutputCommitCoordinator { - // This actor is used only for RPC + // This endpoint is used only for RPC private[spark] class OutputCommitCoordinatorEndpoint( 
override val rpcEnv: RpcEnv, outputCommitCoordinator: OutputCommitCoordinator) extends RpcEndpoint with Logging { diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index 9c2606e278c5..c4dc080e2b22 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -45,8 +45,10 @@ private[spark] class ResultTask[T, U]( taskBinary: Broadcast[Array[Byte]], partition: Partition, @transient locs: Seq[TaskLocation], - val outputId: Int) - extends Task[U](stageId, stageAttemptId, partition.index) with Serializable { + val outputId: Int, + internalAccumulators: Seq[Accumulator[Long]]) + extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators) + with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala index 14c8c0096148..f478f9982afe 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala @@ -43,12 +43,14 @@ private[spark] class ShuffleMapTask( stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, - @transient private var locs: Seq[TaskLocation]) - extends Task[MapStatus](stageId, stageAttemptId, partition.index) with Logging { + @transient private var locs: Seq[TaskLocation], + internalAccumulators: Seq[Accumulator[Long]]) + extends Task[MapStatus](stageId, stageAttemptId, partition.index, internalAccumulators) + with Logging { /** A constructor used only in test suites. This does not require passing in an RDD. */ def this(partitionId: Int) { - this(0, 0, null, new Partition { override def index: Int = 0 }, null) + this(0, 0, null, new Partition { override def index: Int = 0 }, null, null) } @transient private val preferredLocs: Seq[TaskLocation] = { @@ -69,7 +71,7 @@ private[spark] class ShuffleMapTask( val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) - return writer.stop(success = true).get + writer.stop(success = true).get } catch { case e: Exception => try { diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index 40a333a3e06b..1cf06856ffbc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -68,6 +68,22 @@ private[spark] abstract class Stage( val name = callSite.shortForm val details = callSite.longForm + private var _internalAccumulators: Seq[Accumulator[Long]] = Seq.empty + + /** Internal accumulators shared across all tasks in this stage. */ + def internalAccumulators: Seq[Accumulator[Long]] = _internalAccumulators + + /** + * Re-initialize the internal accumulators associated with this stage. + * + * This is called every time the stage is submitted, *except* when a subset of tasks + * belonging to this stage has already finished. Otherwise, reinitializing the internal + * accumulators here again will override partial values from the finished tasks. 
+ */ + def resetInternalAccumulators(): Unit = { + _internalAccumulators = InternalAccumulator.create(rdd.sparkContext) + } + /** * Pointer to the [StageInfo] object for the most recent attempt. This needs to be initialized * here, before any attempts have actually been created, because the DAGScheduler uses this diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 1978305cfefb..9edf9f048f9f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -23,7 +23,7 @@ import java.nio.ByteBuffer import scala.collection.mutable.HashMap import org.apache.spark.metrics.MetricsSystem -import org.apache.spark.{SparkEnv, TaskContextImpl, TaskContext} +import org.apache.spark.{Accumulator, SparkEnv, TaskContextImpl, TaskContext} import org.apache.spark.executor.TaskMetrics import org.apache.spark.serializer.SerializerInstance import org.apache.spark.unsafe.memory.TaskMemoryManager @@ -47,7 +47,8 @@ import org.apache.spark.util.Utils private[spark] abstract class Task[T]( val stageId: Int, val stageAttemptId: Int, - var partitionId: Int) extends Serializable { + val partitionId: Int, + internalAccumulators: Seq[Accumulator[Long]]) extends Serializable { /** * The key of the Map is the accumulator id and the value of the Map is the latest accumulator @@ -68,12 +69,13 @@ private[spark] abstract class Task[T]( metricsSystem: MetricsSystem) : (T, AccumulatorUpdates) = { context = new TaskContextImpl( - stageId = stageId, - partitionId = partitionId, - taskAttemptId = taskAttemptId, - attemptNumber = attemptNumber, - taskMemoryManager = taskMemoryManager, - metricsSystem = metricsSystem, + stageId, + partitionId, + taskAttemptId, + attemptNumber, + taskMemoryManager, + metricsSystem, + internalAccumulators, runningLocally = false) TaskContext.setTaskContext(context) context.taskMetrics.setHostname(Utils.localHostName()) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 82455b0426a5..818b95d67f6b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -662,7 +662,7 @@ private[spark] class TaskSetManager( val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}): " + reason.asInstanceOf[TaskFailedReason].toErrorString - reason match { + val failureException: Option[Throwable] = reason match { case fetchFailed: FetchFailed => logWarning(failureReason) if (!successful(index)) { @@ -671,6 +671,7 @@ private[spark] class TaskSetManager( } // Not adding to failed executors for FetchFailed. isZombie = true + None case ef: ExceptionFailure => taskMetrics = ef.metrics.orNull @@ -706,12 +707,15 @@ private[spark] class TaskSetManager( s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid) on executor ${info.host}: " + s"${ef.className} (${ef.description}) [duplicate $dupCount]") } + ef.exception case e: TaskFailedReason => // TaskResultLost, TaskKilled, and others logWarning(failureReason) + None case e: TaskEndReason => logError("Unknown TaskEndReason: " + e) + None } // always add to failed executors failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()). 
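Illustrative sketch (not part of the patch): the Option[Throwable] threaded through TaskSetManager.abort(), taskSetFailed() and abortStage() ends up as the cause of the job-level exception, so the error surfaced to the user carries the executor-side stack trace rather than only a formatted summary string. RuntimeException stands in for SparkException here.

object FailureCausePropagationSketch {
  def failJob(failureReason: String, exception: Option[Throwable]): Exception =
    new RuntimeException(failureReason, exception.orNull) // cause may be absent, e.g. for FetchFailed

  def main(args: Array[String]): Unit = {
    val taskError = new IllegalStateException("boom in user code")
    val withCause = failJob("Job aborted due to stage failure: task failed 4 times", Some(taskError))
    val withoutCause = failJob("Fetch failure will not retry stage due to testing config", None)
    println(withCause.getCause)    // java.lang.IllegalStateException: boom in user code
    println(withoutCause.getCause) // null
  }
}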
@@ -728,16 +732,16 @@ private[spark] class TaskSetManager( logError("Task %d in stage %s failed %d times; aborting job".format( index, taskSet.id, maxTaskFailures)) abort("Task %d in stage %s failed %d times, most recent failure: %s\nDriver stacktrace:" - .format(index, taskSet.id, maxTaskFailures, failureReason)) + .format(index, taskSet.id, maxTaskFailures, failureReason), failureException) return } } maybeFinishTaskSet() } - def abort(message: String): Unit = sched.synchronized { + def abort(message: String, exception: Option[Throwable] = None): Unit = sched.synchronized { // TODO: Kill running tasks if we were not terminated due to a Mesos error - sched.dagScheduler.taskSetFailed(taskSet, message) + sched.dagScheduler.taskSetFailed(taskSet, message, exception) isZombie = true maybeFinishTaskSet() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 6acf8a9a5e9b..5730a87f960a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -422,16 +422,19 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logWarning(s"Executor to kill $id does not exist!") } + // If an executor is already pending to be removed, do not kill it again (SPARK-9795) + val executorsToKill = knownExecutors.filter { id => !executorsPendingToRemove.contains(id) } + executorsPendingToRemove ++= executorsToKill + // If we do not wish to replace the executors we kill, sync the target number of executors // with the cluster manager to avoid allocating new ones. When computing the new target, // take into account executors that are pending to be added or removed. if (!replace) { - doRequestTotalExecutors(numExistingExecutors + numPendingExecutors - - executorsPendingToRemove.size - knownExecutors.size) + doRequestTotalExecutors( + numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size) } - executorsPendingToRemove ++= knownExecutors - doKillExecutors(knownExecutors) + doKillExecutors(executorsToKill) } /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala index 26e72c0bff38..626a2b7d69ab 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala @@ -22,7 +22,7 @@ import org.apache.spark.rpc.{RpcEndpointRef, RpcAddress} /** * Grouping of data for an executor used by CoarseGrainedSchedulerBackend. 
* - * @param executorEndpoint The ActorRef representing this executor + * @param executorEndpoint The RpcEndpointRef representing this executor * @param executorAddress The network address of this executor * @param executorHost The hostname that this executor is running on * @param freeCores The current number of cores available for work on the executor diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 15a0915708c7..d6e1e9e5bebc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -194,6 +194,11 @@ private[spark] class CoarseMesosSchedulerBackend( s" --app-id $appId") command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get)) } + + conf.getOption("spark.mesos.uris").map { uris => + setupUris(uris, command) + } + command.build() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index f078547e7135..1206f184fbc8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -403,6 +403,9 @@ private[spark] class MesosClusterScheduler( } builder.setValue(s"$executable $cmdOptions $jar $appArguments") builder.setEnvironment(envBuilder.build()) + conf.getOption("spark.mesos.uris").map { uris => + setupUris(uris, builder) + } builder.build() } @@ -504,14 +507,16 @@ private[spark] class MesosClusterScheduler( val driversToRetry = pendingRetryDrivers.filter { d => d.retryState.get.nextRetry.before(currentTime) } + scheduleTasks( - driversToRetry, + copyBuffer(driversToRetry), removeFromPendingRetryDrivers, currentOffers, tasks) + // Then we walk through the queued drivers and try to schedule them. 
scheduleTasks( - queuedDrivers, + copyBuffer(queuedDrivers), removeFromQueuedDrivers, currentOffers, tasks) @@ -524,13 +529,14 @@ private[spark] class MesosClusterScheduler( .foreach(o => driver.declineOffer(o.getId)) } + private def copyBuffer( + buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { + val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size) + buffer.copyToBuffer(newBuffer) + newBuffer + } + def getSchedulerState(): MesosClusterSchedulerState = { - def copyBuffer( - buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { - val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size) - buffer.copyToBuffer(newBuffer) - newBuffer - } stateLock.synchronized { new MesosClusterSchedulerState( frameworkId, diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 3f63ec1c5832..033454eddc09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -32,7 +32,6 @@ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.Utils - /** * A SchedulerBackend for running fine-grained tasks on Mesos. Each Spark task is mapped to a * separate Mesos task, allowing multiple applications to share cluster nodes both in space (tasks @@ -127,12 +126,17 @@ private[spark] class MesosSchedulerBackend( } val builder = MesosExecutorInfo.newBuilder() val (resourcesAfterCpu, usedCpuResources) = - partitionResources(availableResources, "cpus", scheduler.CPUS_PER_TASK) + partitionResources(availableResources, "cpus", mesosExecutorCores) val (resourcesAfterMem, usedMemResources) = partitionResources(resourcesAfterCpu, "mem", calculateTotalMemory(sc)) builder.addAllResources(usedCpuResources) builder.addAllResources(usedMemResources) + + sc.conf.getOption("spark.mesos.uris").map { uris => + setupUris(uris, command) + } + val executorInfo = builder .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index c04920e4f587..5b854aa5c275 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -331,4 +331,10 @@ private[mesos] trait MesosSchedulerUtils extends Logging { sc.executorMemory } + def setupUris(uris: String, builder: CommandInfo.Builder): Unit = { + uris.split(",").foreach { uri => + builder.addUris(CommandInfo.URI.newBuilder().setValue(uri.trim())) + } + } + } diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index fae69551e733..d0163d326dba 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -71,7 +71,7 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB /** * Write an index file with the offsets of each block, plus a final offset at the end for the - * end 
of the output file. This will be used by getBlockLocation to figure out where each block + * end of the output file. This will be used by getBlockData to figure out where each block * begins and ends. * */ def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long]): Unit = { diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index 00c1e078a441..a0d8abc2eecb 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -19,6 +19,9 @@ package org.apache.spark.shuffle import scala.collection.mutable +import com.google.common.annotations.VisibleForTesting + +import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.{Logging, SparkException, SparkConf, TaskContext} /** @@ -34,11 +37,19 @@ import org.apache.spark.{Logging, SparkException, SparkConf, TaskContext} * set of active tasks and redo the calculations of 1 / 2N and 1 / N in waiting tasks whenever * this set changes. This is all done by synchronizing access on "this" to mutate state and using * wait() and notifyAll() to signal changes. + * + * Use `ShuffleMemoryManager.create()` factory method to create a new instance. + * + * @param maxMemory total amount of memory available for execution, in bytes. + * @param pageSizeBytes number of bytes for each page, by default. */ -private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging { - private val taskMemory = new mutable.HashMap[Long, Long]() // taskAttemptId -> memory bytes +private[spark] +class ShuffleMemoryManager protected ( + val maxMemory: Long, + val pageSizeBytes: Long) + extends Logging { - def this(conf: SparkConf) = this(ShuffleMemoryManager.getMaxMemory(conf)) + private val taskMemory = new mutable.HashMap[Long, Long]() // taskAttemptId -> memory bytes private def currentTaskAttemptId(): Long = { // In case this is called on the driver, return an invalid task attempt id. @@ -124,15 +135,51 @@ private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging { } } -private object ShuffleMemoryManager { + +private[spark] object ShuffleMemoryManager { + + def create(conf: SparkConf, numCores: Int): ShuffleMemoryManager = { + val maxMemory = ShuffleMemoryManager.getMaxMemory(conf) + val pageSize = ShuffleMemoryManager.getPageSize(conf, maxMemory, numCores) + new ShuffleMemoryManager(maxMemory, pageSize) + } + + def create(maxMemory: Long, pageSizeBytes: Long): ShuffleMemoryManager = { + new ShuffleMemoryManager(maxMemory, pageSizeBytes) + } + + @VisibleForTesting + def createForTesting(maxMemory: Long): ShuffleMemoryManager = { + new ShuffleMemoryManager(maxMemory, 4 * 1024 * 1024) + } + /** * Figure out the shuffle memory limit from a SparkConf. We currently have both a fraction * of the memory pool and a safety factor since collections can sometimes grow bigger than * the size we target before we estimate their sizes again. */ - def getMaxMemory(conf: SparkConf): Long = { + private def getMaxMemory(conf: SparkConf): Long = { val memoryFraction = conf.getDouble("spark.shuffle.memoryFraction", 0.2) val safetyFraction = conf.getDouble("spark.shuffle.safetyFraction", 0.8) (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong } + + /** + * Sets the page size, in bytes. 
+ * + * If user didn't explicitly set "spark.buffer.pageSize", we figure out the default value + * by looking at the number of cores available to the process, and the total amount of memory, + * and then divide it by a factor of safety. + */ + private def getPageSize(conf: SparkConf, maxMemory: Long, numCores: Int): Long = { + val minPageSize = 1L * 1024 * 1024 // 1MB + val maxPageSize = 64L * minPageSize // 64MB + val cores = if (numCores > 0) numCores else Runtime.getRuntime.availableProcessors() + // Because of rounding to next power of 2, we may have safetyFactor as 8 in worst case + val safetyFactor = 16 + // TODO(davies): don't round to next power of 2 + val size = ByteArrayMethods.nextPowerOf2(maxMemory / cores / safetyFactor) + val default = math.min(maxPageSize, math.max(minPageSize, size)) + conf.getSizeAsBytes("spark.buffer.pageSize", default) + } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala index de79fa56f017..0c8f08f0f3b1 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala @@ -17,7 +17,7 @@ package org.apache.spark.shuffle.hash -import org.apache.spark.{InterruptibleIterator, Logging, MapOutputTracker, SparkEnv, TaskContext} +import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader} import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator} @@ -100,8 +100,10 @@ private[spark] class HashShuffleReader[K, C]( // the ExternalSorter won't spill to disk. val sorter = new ExternalSorter[K, C, C](ordering = Some(keyOrd), serializer = Some(ser)) sorter.insertAll(aggregatedIter) - context.taskMetrics.incMemoryBytesSpilled(sorter.memoryBytesSpilled) - context.taskMetrics.incDiskBytesSpilled(sorter.diskBytesSpilled) + context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) + context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) sorter.iterator case None => aggregatedIter diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 86493673d958..eedb27942e84 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -222,7 +222,7 @@ private[spark] class BlockManager( return } catch { case e: Exception if i < MAX_ATTEMPTS => - logError(s"Failed to connect to external shuffle server, will retry ${MAX_ATTEMPTS - i}}" + logError(s"Failed to connect to external shuffle server, will retry ${MAX_ATTEMPTS - i}" + s" more times after waiting $SLEEP_TIME_SECS seconds...", e) Thread.sleep(SLEEP_TIME_SECS * 1000) } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index f70f701494db..f45bff34d4db 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -69,8 +69,9 @@ class BlockManagerMaster( } /** Get locations of multiple blockIds from the driver */ - def getLocations(blockIds: Array[BlockId]): Seq[Seq[BlockManagerId]] = { - 
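Illustrative sketch (not part of the patch) of the default page-size rule getPageSize adds to ShuffleMemoryManager above: when "spark.buffer.pageSize" is not set, divide the execution memory by the core count and a safety factor of 16, round up to the next power of two, and clamp the result to [1 MiB, 64 MiB].

object PageSizeSketch {
  // Same rounding behavior as the nextPowerOf2 helper referenced above.
  private def nextPowerOf2(n: Long): Long = {
    val highBit = java.lang.Long.highestOneBit(n)
    if (highBit == n) n else highBit << 1
  }

  def defaultPageSize(maxMemory: Long, numCores: Int): Long = {
    val minPageSize = 1L * 1024 * 1024  // 1 MiB
    val maxPageSize = 64L * minPageSize // 64 MiB
    val cores = if (numCores > 0) numCores else Runtime.getRuntime.availableProcessors()
    val safetyFactor = 16
    val size = nextPowerOf2(maxMemory / cores / safetyFactor)
    math.min(maxPageSize, math.max(minPageSize, size))
  }

  def main(args: Array[String]): Unit = {
    // Roughly 1.6 GiB of execution memory on an 8-core executor:
    // 1.6 GiB / 8 / 16 is about 12.8 MiB, rounded up to 16 MiB.
    val bytes = defaultPageSize((1.6 * 1024 * 1024 * 1024).toLong, 8)
    println(bytes / (1024 * 1024) + " MiB") // 16 MiB
  }
}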
driverEndpoint.askWithRetry[Seq[Seq[BlockManagerId]]](GetLocationsMultipleBlockIds(blockIds)) + def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { + driverEndpoint.askWithRetry[IndexedSeq[Seq[BlockManagerId]]]( + GetLocationsMultipleBlockIds(blockIds)) } /** @@ -103,7 +104,7 @@ class BlockManagerMaster( val future = driverEndpoint.askWithRetry[Future[Seq[Int]]](RemoveRdd(rddId)) future.onFailure { case e: Exception => - logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}}", e) + logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}", e) }(ThreadUtils.sameThread) if (blocking) { timeout.awaitResult(future) @@ -115,7 +116,7 @@ class BlockManagerMaster( val future = driverEndpoint.askWithRetry[Future[Seq[Boolean]]](RemoveShuffle(shuffleId)) future.onFailure { case e: Exception => - logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}}", e) + logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}", e) }(ThreadUtils.sameThread) if (blocking) { timeout.awaitResult(future) @@ -129,7 +130,7 @@ class BlockManagerMaster( future.onFailure { case e: Exception => logWarning(s"Failed to remove broadcast $broadcastId" + - s" with removeFromMaster = $removeFromMaster - ${e.getMessage}}", e) + s" with removeFromMaster = $removeFromMaster - ${e.getMessage}", e) }(ThreadUtils.sameThread) if (blocking) { timeout.awaitResult(future) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 5dc0c537cbb6..6fec5240707a 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -372,7 +372,8 @@ class BlockManagerMasterEndpoint( if (blockLocations.containsKey(blockId)) blockLocations.get(blockId).toSeq else Seq.empty } - private def getLocationsMultipleBlockIds(blockIds: Array[BlockId]): Seq[Seq[BlockManagerId]] = { + private def getLocationsMultipleBlockIds( + blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { blockIds.map(blockId => getLocations(blockId)) } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 5f537692a16c..f7e84a2c2e14 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -22,7 +22,7 @@ import java.io.{IOException, File} import org.apache.spark.{SparkConf, Logging} import org.apache.spark.executor.ExecutorExitCode -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} /** * Creates and maintains the logical mapping between logical blocks and physical on-disk @@ -133,7 +133,6 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon Utils.getConfiguredLocalDirs(conf).flatMap { rootDir => try { val localDir = Utils.createDirectory(rootDir, "blockmgr") - Utils.chmod700(localDir) logInfo(s"Created local directory at $localDir") Some(localDir) } catch { @@ -145,7 +144,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon } private def addShutdownHook(): AnyRef = { - Utils.addShutdownHook(Utils.TEMP_DIR_SHUTDOWN_PRIORITY + 1) { () => + ShutdownHookManager.addShutdownHook(ShutdownHookManager.TEMP_DIR_SHUTDOWN_PRIORITY + 1) { () => logInfo("Shutdown hook called") DiskBlockManager.this.doStop() 
} @@ -155,7 +154,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon private[spark] def stop() { // Remove the shutdown hook. It causes memory leaks if we leave it around. try { - Utils.removeShutdownHook(shutdownHook) + ShutdownHookManager.removeShutdownHook(shutdownHook) } catch { case e: Exception => logError(s"Exception while removing shutdown hook.", e) @@ -165,11 +164,15 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon private def doStop(): Unit = { // Only perform cleanup if an external service is not serving our shuffle files. - if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) { + // Also blockManagerId could be null if block manager is not initialized properly. + if (!blockManager.externalShuffleServiceEnabled || + (blockManager.blockManagerId != null && blockManager.blockManagerId.isDriver)) { localDirs.foreach { localDir => if (localDir.isDirectory() && localDir.exists()) { try { - if (!Utils.hasRootAsShutdownDeleteDir(localDir)) Utils.deleteRecursively(localDir) + if (!ShutdownHookManager.hasRootAsShutdownDeleteDir(localDir)) { + Utils.deleteRecursively(localDir) + } } catch { case e: Exception => logError(s"Exception while deleting local spark dir: $localDir", e) diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala index b53c86e89a27..22878783fca6 100644 --- a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala @@ -27,11 +27,12 @@ import scala.util.control.NonFatal import com.google.common.io.ByteStreams import tachyon.client.{ReadType, WriteType, TachyonFS, TachyonFile} +import tachyon.conf.TachyonConf import tachyon.TachyonURI -import org.apache.spark.{SparkException, SparkConf, Logging} +import org.apache.spark.Logging import org.apache.spark.executor.ExecutorExitCode -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} /** @@ -60,7 +61,11 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log rootDirs = s"$storeDir/$appFolderName/$executorId" master = blockManager.conf.get(ExternalBlockStore.MASTER_URL, "tachyon://localhost:19998") - client = if (master != null && master != "") TachyonFS.get(new TachyonURI(master)) else null + client = if (master != null && master != "") { + TachyonFS.get(new TachyonURI(master), new TachyonConf()) + } else { + null + } // original implementation call System.exit, we change it to run without extblkstore support if (client == null) { logError("Failed to connect to the Tachyon as the master address is not configured") @@ -75,7 +80,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log // in order to avoid having really large inodes at the top level in Tachyon. 
tachyonDirs = createTachyonDirs() subDirs = Array.fill(tachyonDirs.length)(new Array[TachyonFile](subDirsPerTachyonDir)) - tachyonDirs.foreach(tachyonDir => Utils.registerShutdownDeleteDir(tachyonDir)) + tachyonDirs.foreach(tachyonDir => ShutdownHookManager.registerShutdownDeleteDir(tachyonDir)) } override def toString: String = {"ExternalBlockStore-Tachyon"} @@ -235,7 +240,7 @@ private[spark] class TachyonBlockManager() extends ExternalBlockManager with Log logDebug("Shutdown hook called") tachyonDirs.foreach { tachyonDir => try { - if (!Utils.hasRootAsShutdownDeleteDir(tachyonDir)) { + if (!ShutdownHookManager.hasRootAsShutdownDeleteDir(tachyonDir)) { Utils.deleteRecursively(tachyonDir, client) } } catch { diff --git a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala index 17d7b39c2d95..6e2375477a68 100644 --- a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala @@ -159,9 +159,9 @@ private[ui] trait PagedTable[T] { // "goButtonJsFuncName" val formJs = s"""$$(function(){ - | $$( "#form-task-page" ).submit(function(event) { - | var page = $$("#form-task-page-no").val() - | var pageSize = $$("#form-task-page-size").val() + | $$( "#form-$tableId-page" ).submit(function(event) { + | var page = $$("#form-$tableId-page-no").val() + | var pageSize = $$("#form-$tableId-page-size").val() | pageSize = pageSize ? pageSize: 100; | if (page != "") { | ${goButtonJsFuncName}(page, pageSize); @@ -173,12 +173,14 @@ private[ui] trait PagedTable[T] {
-
+ - + - - + +
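Illustrative sketch (not part of the patch): the PagedTable change above derives the pagination form's element ids and jQuery selectors from tableId instead of the hard-coded "form-task-page", so the task table and the block table added later in this patch each get their own working "jump to page" form. The PageForm class below is hypothetical and only mirrors that id scheme.

object PagedTableIdSketch {
  final case class PageForm(tableId: String) {
    def formId: String = s"form-$tableId-page"
    def pageInputId: String = s"form-$tableId-page-no"
    def pageSizeInputId: String = s"form-$tableId-page-size"

    // Submit handler scoped to this table's form only.
    def submitHandlerJs(goButtonJsFuncName: String): String =
      s"""$$(function(){
         |  $$("#$formId").submit(function(event) {
         |    var page = $$("#$pageInputId").val()
         |    var pageSize = $$("#$pageSizeInputId").val()
         |    pageSize = pageSize ? pageSize : 100;
         |    if (page != "") { $goButtonJsFuncName(page, pageSize); }
         |    event.preventDefault();
         |  });
         |});""".stripMargin
  }

  def main(args: Array[String]): Unit = {
    println(PageForm("task-table").submitHandlerJs("goToTaskPage"))
    println(PageForm("rdd-storage-by-block-table").submitHandlerJs("goToBlockPage"))
  }
}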
diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index e2d25e36365f..cb122eaed83d 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -62,6 +62,13 @@ private[spark] object ToolTips { """Time that the executor spent paused for Java garbage collection while the task was running.""" + val PEAK_EXECUTION_MEMORY = + """Execution memory refers to the memory used by internal data structures created during + shuffles, aggregations and joins when Tungsten is enabled. The value of this accumulator + should be approximately the sum of the peak sizes across all such data structures created + in this task. For SQL jobs, this only tracks all unsafe operators, broadcast joins, and + external sort.""" + val JOB_TIMELINE = """Shows when jobs started and ended and when executors joined or left. Drag to scroll. Click Enable Zooming and use mouse wheel to zoom in/out.""" diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 718aea7e1dc2..f2da41772410 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -352,7 +352,8 @@ private[spark] object UIUtils extends Logging { */ private def showDagViz(graphs: Seq[RDDOperationGraph], forJob: Boolean): Seq[Node] = {
- + diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index f0ae95bb8c81..b0a2cb4aa4d4 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -49,11 +49,29 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage val maybeThreadDump = sc.get.getExecutorThreadDump(executorId) val content = maybeThreadDump.map { threadDump => - val dumpRows = threadDump.map { thread => + val dumpRows = threadDump.sortWith { + case (threadTrace1, threadTrace2) => { + val v1 = if (threadTrace1.threadName.contains("Executor task launch")) 1 else 0 + val v2 = if (threadTrace2.threadName.contains("Executor task launch")) 1 else 0 + if (v1 == v2) { + threadTrace1.threadName.toLowerCase < threadTrace2.threadName.toLowerCase + } else { + v1 > v2 + } + } + }.map { thread => + val threadName = thread.threadName + val className = "accordion-heading " + { + if (threadName.contains("Executor task launch")) { + "executor-thread" + } else { + "non-executor-thread" + } + }
-
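Illustrative sketch (not part of the patch) of the ordering rule the ExecutorThreadDumpPage change above applies before rendering the dump: threads whose name contains "Executor task launch" sort first, and the rest follow case-insensitively by name.

object ThreadDumpOrderingSketch {
  def sortThreadNames(names: Seq[String]): Seq[String] = names.sortWith { (a, b) =>
    val aIsTaskThread = if (a.contains("Executor task launch")) 1 else 0
    val bIsTaskThread = if (b.contains("Executor task launch")) 1 else 0
    if (aIsTaskThread == bIsTaskThread) {
      a.toLowerCase < b.toLowerCase
    } else {
      aIsTaskThread > bIsTaskThread
    }
  }

  def main(args: Array[String]): Unit = {
    println(sortThreadNames(Seq("Signal Dispatcher", "Executor task launch worker-1", "main")))
    // List(Executor task launch worker-1, main, Signal Dispatcher)
  }
}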
+ @@ -241,11 +253,12 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val accumulableTable = UIUtils.listingTable( accumulableHeaders, accumulableRow, - accumulables.values.toSeq) + externalAccumulables.toSeq) val currentTime = System.currentTimeMillis() val (taskTable, taskTableHTML) = try { val _taskTable = new TaskPagedTable( + parent.conf, UIUtils.prependBaseUri(parent.basePath) + s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}", tasks, @@ -294,12 +307,14 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { else { def getDistributionQuantiles(data: Seq[Double]): IndexedSeq[Double] = Distribution(data).get.getQuantiles() - def getFormattedTimeQuantiles(times: Seq[Double]): Seq[Node] = { getDistributionQuantiles(times).map { millis => {UIUtils.formatDuration(millis.toLong)} } } + def getFormattedSizeQuantiles(data: Seq[Double]): Seq[Elem] = { + getDistributionQuantiles(data).map(d => {Utils.bytesToString(d.toLong)}) + } val deserializationTimes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.executorDeserializeTime.toDouble @@ -349,6 +364,23 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { +: getFormattedTimeQuantiles(gettingResultTimes) + + val peakExecutionMemory = validTasks.map { case TaskUIData(info, _, _) => + info.accumulables + .find { acc => acc.name == InternalAccumulator.PEAK_EXECUTION_MEMORY } + .map { acc => acc.update.getOrElse("0").toLong } + .getOrElse(0L) + .toDouble + } + val peakExecutionMemoryQuantiles = { + + + Peak Execution Memory + + +: getFormattedSizeQuantiles(peakExecutionMemory) + } + // The scheduler delay includes the network delay to send the task to the worker // machine and to send back the result (but not the time to fetch the task result, // if it needed to be fetched from the block manager on the worker). @@ -359,10 +391,6 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { title={ToolTips.SCHEDULER_DELAY} data-placement="right">Scheduler Delay val schedulerDelayQuantiles = schedulerDelayTitle +: getFormattedTimeQuantiles(schedulerDelays) - - def getFormattedSizeQuantiles(data: Seq[Double]): Seq[Elem] = - getDistributionQuantiles(data).map(d => {Utils.bytesToString(d.toLong)}) - def getFormattedSizeQuantilesWithRecords(data: Seq[Double], records: Seq[Double]) : Seq[Elem] = { val recordDist = getDistributionQuantiles(records).iterator @@ -466,6 +494,13 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {serializationQuantiles} , {gettingResultQuantiles}, + if (displayPeakExecutionMemory) { + + {peakExecutionMemoryQuantiles} + + } else { + Nil + }, if (stageData.hasInput) {inputQuantiles} else Nil, if (stageData.hasOutput) {outputQuantiles} else Nil, if (stageData.hasShuffleRead) { @@ -499,7 +534,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val executorTable = new ExecutorTable(stageId, stageAttemptId, parent) val maybeAccumulableTable: Seq[Node] = - if (accumulables.size > 0) {

<h4>Accumulators</h4> ++ accumulableTable } else Seq() + if (hasAccumulators) { <h4>Accumulators</h4>
++ accumulableTable } else Seq() val content = summary ++ @@ -750,29 +785,30 @@ private[ui] case class TaskTableRowBytesSpilledData( * Contains all data that needs for sorting and generating HTML. Using this one rather than * TaskUIData to avoid creating duplicate contents during sorting the data. */ -private[ui] case class TaskTableRowData( - index: Int, - taskId: Long, - attempt: Int, - speculative: Boolean, - status: String, - taskLocality: String, - executorIdAndHost: String, - launchTime: Long, - duration: Long, - formatDuration: String, - schedulerDelay: Long, - taskDeserializationTime: Long, - gcTime: Long, - serializationTime: Long, - gettingResultTime: Long, - accumulators: Option[String], // HTML - input: Option[TaskTableRowInputData], - output: Option[TaskTableRowOutputData], - shuffleRead: Option[TaskTableRowShuffleReadData], - shuffleWrite: Option[TaskTableRowShuffleWriteData], - bytesSpilled: Option[TaskTableRowBytesSpilledData], - error: String) +private[ui] class TaskTableRowData( + val index: Int, + val taskId: Long, + val attempt: Int, + val speculative: Boolean, + val status: String, + val taskLocality: String, + val executorIdAndHost: String, + val launchTime: Long, + val duration: Long, + val formatDuration: String, + val schedulerDelay: Long, + val taskDeserializationTime: Long, + val gcTime: Long, + val serializationTime: Long, + val gettingResultTime: Long, + val peakExecutionMemoryUsed: Long, + val accumulators: Option[String], // HTML + val input: Option[TaskTableRowInputData], + val output: Option[TaskTableRowOutputData], + val shuffleRead: Option[TaskTableRowShuffleReadData], + val shuffleWrite: Option[TaskTableRowShuffleWriteData], + val bytesSpilled: Option[TaskTableRowBytesSpilledData], + val error: String) private[ui] class TaskDataSource( tasks: Seq[TaskUIData], @@ -816,10 +852,15 @@ private[ui] class TaskDataSource( val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L) val gettingResultTime = getGettingResultTime(info, currentTime) - val maybeAccumulators = info.accumulables - val accumulatorsReadable = maybeAccumulators.map { acc => + val (taskInternalAccumulables, taskExternalAccumulables) = + info.accumulables.partition(_.internal) + val externalAccumulableReadable = taskExternalAccumulables.map { acc => StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update.get}") } + val peakExecutionMemoryUsed = taskInternalAccumulables + .find { acc => acc.name == InternalAccumulator.PEAK_EXECUTION_MEMORY } + .map { acc => acc.update.getOrElse("0").toLong } + .getOrElse(0L) val maybeInput = metrics.flatMap(_.inputMetrics) val inputSortable = maybeInput.map(_.bytesRead).getOrElse(0L) @@ -923,7 +964,7 @@ private[ui] class TaskDataSource( None } - TaskTableRowData( + new TaskTableRowData( info.index, info.taskId, info.attempt, @@ -939,14 +980,14 @@ private[ui] class TaskDataSource( gcTime, serializationTime, gettingResultTime, - if (hasAccumulators) Some(accumulatorsReadable.mkString("
")) else None, + peakExecutionMemoryUsed, + if (hasAccumulators) Some(externalAccumulableReadable.mkString("
")) else None, input, output, shuffleRead, shuffleWrite, bytesSpilled, - errorMessage.getOrElse("") - ) + errorMessage.getOrElse("")) } /** @@ -1006,6 +1047,10 @@ private[ui] class TaskDataSource( override def compare(x: TaskTableRowData, y: TaskTableRowData): Int = Ordering.Long.compare(x.gettingResultTime, y.gettingResultTime) } + case "Peak Execution Memory" => new Ordering[TaskTableRowData] { + override def compare(x: TaskTableRowData, y: TaskTableRowData): Int = + Ordering.Long.compare(x.peakExecutionMemoryUsed, y.peakExecutionMemoryUsed) + } case "Accumulators" => if (hasAccumulators) { new Ordering[TaskTableRowData] { @@ -1132,6 +1177,7 @@ private[ui] class TaskDataSource( } private[ui] class TaskPagedTable( + conf: SparkConf, basePath: String, data: Seq[TaskUIData], hasAccumulators: Boolean, @@ -1143,9 +1189,12 @@ private[ui] class TaskPagedTable( currentTime: Long, pageSize: Int, sortColumn: String, - desc: Boolean) extends PagedTable[TaskTableRowData]{ + desc: Boolean) extends PagedTable[TaskTableRowData] { - override def tableId: String = "" + // We only track peak memory used for unsafe operators + private val displayPeakExecutionMemory = conf.getBoolean("spark.sql.unsafe.enabled", true) + + override def tableId: String = "task-table" override def tableCssClass: String = "table table-bordered table-condensed table-striped" @@ -1160,8 +1209,7 @@ private[ui] class TaskPagedTable( currentTime, pageSize, sortColumn, - desc - ) + desc) override def pageLink(page: Int): String = { val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8") @@ -1195,6 +1243,13 @@ private[ui] class TaskPagedTable( ("GC Time", ""), ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME)) ++ + { + if (displayPeakExecutionMemory) { + Seq(("Peak Execution Memory", TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) + } else { + Nil + } + } ++ {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++ {if (hasInput) Seq(("Input Size / Records", "")) else Nil} ++ {if (hasOutput) Seq(("Output Size / Records", "")) else Nil} ++ @@ -1218,7 +1273,7 @@ private[ui] class TaskPagedTable( Seq(("Errors", "")) if (!taskHeadersAndCssClasses.map(_._1).contains(sortColumn)) { - new IllegalArgumentException(s"Unknown column: $sortColumn") + throw new IllegalArgumentException(s"Unknown column: $sortColumn") } val headerRow: Seq[Node] = { @@ -1271,6 +1326,11 @@ private[ui] class TaskPagedTable( {UIUtils.formatDuration(task.gettingResultTime)} + {if (displayPeakExecutionMemory) { + + {Utils.bytesToString(task.peakExecutionMemoryUsed)} + + }} {if (task.accumulators.nonEmpty) { {Unparsed(task.accumulators.get)} }} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala b/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala index 9bf67db8acde..d2dfc5a32915 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala @@ -31,4 +31,5 @@ private[spark] object TaskDetailsClassNames { val SHUFFLE_READ_REMOTE_SIZE = "shuffle_read_remote" val RESULT_SERIALIZATION_TIME = "serialization_time" val GETTING_RESULT_TIME = "getting_result_time" + val PEAK_EXECUTION_MEMORY = "peak_execution_memory" } diff --git a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala index 36943978ff59..fd6cc3ed759b 100644 --- 
a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala @@ -17,12 +17,13 @@ package org.apache.spark.ui.storage +import java.net.URLEncoder import javax.servlet.http.HttpServletRequest -import scala.xml.Node +import scala.xml.{Node, Unparsed} import org.apache.spark.status.api.v1.{AllRDDResource, RDDDataDistribution, RDDPartitionInfo} -import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.ui.{PagedDataSource, PagedTable, UIUtils, WebUIPage} import org.apache.spark.util.Utils /** Page showing storage details for a given RDD */ @@ -32,6 +33,17 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") { def render(request: HttpServletRequest): Seq[Node] = { val parameterId = request.getParameter("id") require(parameterId != null && parameterId.nonEmpty, "Missing id parameter") + + val parameterBlockPage = request.getParameter("block.page") + val parameterBlockSortColumn = request.getParameter("block.sort") + val parameterBlockSortDesc = request.getParameter("block.desc") + val parameterBlockPageSize = request.getParameter("block.pageSize") + + val blockPage = Option(parameterBlockPage).map(_.toInt).getOrElse(1) + val blockSortColumn = Option(parameterBlockSortColumn).getOrElse("Block Name") + val blockSortDesc = Option(parameterBlockSortDesc).map(_.toBoolean).getOrElse(false) + val blockPageSize = Option(parameterBlockPageSize).map(_.toInt).getOrElse(100) + val rddId = parameterId.toInt val rddStorageInfo = AllRDDResource.getRDDStorageInfo(rddId, listener, includeDetails = true) .getOrElse { @@ -44,8 +56,34 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") { rddStorageInfo.dataDistribution.get, id = Some("rdd-storage-by-worker-table")) // Block table - val blockTable = UIUtils.listingTable(blockHeader, blockRow, rddStorageInfo.partitions.get, - id = Some("rdd-storage-by-block-table")) + val (blockTable, blockTableHTML) = try { + val _blockTable = new BlockPagedTable( + UIUtils.prependBaseUri(parent.basePath) + s"/storage/rdd/?id=${rddId}", + rddStorageInfo.partitions.get, + blockPageSize, + blockSortColumn, + blockSortDesc) + (_blockTable, _blockTable.table(blockPage)) + } catch { + case e @ (_ : IllegalArgumentException | _ : IndexOutOfBoundsException) => + (null,
{e.getMessage}
) + } + + val jsForScrollingDownToBlockTable = + val content =
@@ -85,11 +123,11 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
-
-
-

{rddStorageInfo.partitions.map(_.size).getOrElse(0)} Partitions

- {blockTable} -
+
+

+ {rddStorageInfo.partitions.map(_.size).getOrElse(0)} Partitions +

+ {blockTableHTML ++ jsForScrollingDownToBlockTable}
; UIUtils.headerSparkPage("RDD Storage Info for " + rddStorageInfo.name, content, parent) @@ -101,14 +139,6 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") { "Memory Usage", "Disk Usage") - /** Header fields for the block table */ - private def blockHeader = Seq( - "Block Name", - "Storage Level", - "Size in Memory", - "Size on Disk", - "Executors") - /** Render an HTML row representing a worker */ private def workerRow(worker: RDDDataDistribution): Seq[Node] = { @@ -120,23 +150,157 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") { {Utils.bytesToString(worker.diskUsed)} } +} + +private[ui] case class BlockTableRowData( + blockName: String, + storageLevel: String, + memoryUsed: Long, + diskUsed: Long, + executors: String) + +private[ui] class BlockDataSource( + rddPartitions: Seq[RDDPartitionInfo], + pageSize: Int, + sortColumn: String, + desc: Boolean) extends PagedDataSource[BlockTableRowData](pageSize) { + + private val data = rddPartitions.map(blockRow).sorted(ordering(sortColumn, desc)) + + override def dataSize: Int = data.size + + override def sliceData(from: Int, to: Int): Seq[BlockTableRowData] = { + data.slice(from, to) + } + + private def blockRow(rddPartition: RDDPartitionInfo): BlockTableRowData = { + BlockTableRowData( + rddPartition.blockName, + rddPartition.storageLevel, + rddPartition.memoryUsed, + rddPartition.diskUsed, + rddPartition.executors.mkString(" ")) + } + + /** + * Return Ordering according to sortColumn and desc + */ + private def ordering(sortColumn: String, desc: Boolean): Ordering[BlockTableRowData] = { + val ordering = sortColumn match { + case "Block Name" => new Ordering[BlockTableRowData] { + override def compare(x: BlockTableRowData, y: BlockTableRowData): Int = + Ordering.String.compare(x.blockName, y.blockName) + } + case "Storage Level" => new Ordering[BlockTableRowData] { + override def compare(x: BlockTableRowData, y: BlockTableRowData): Int = + Ordering.String.compare(x.storageLevel, y.storageLevel) + } + case "Size in Memory" => new Ordering[BlockTableRowData] { + override def compare(x: BlockTableRowData, y: BlockTableRowData): Int = + Ordering.Long.compare(x.memoryUsed, y.memoryUsed) + } + case "Size on Disk" => new Ordering[BlockTableRowData] { + override def compare(x: BlockTableRowData, y: BlockTableRowData): Int = + Ordering.Long.compare(x.diskUsed, y.diskUsed) + } + case "Executors" => new Ordering[BlockTableRowData] { + override def compare(x: BlockTableRowData, y: BlockTableRowData): Int = + Ordering.String.compare(x.executors, y.executors) + } + case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") + } + if (desc) { + ordering.reverse + } else { + ordering + } + } +} + +private[ui] class BlockPagedTable( + basePath: String, + rddPartitions: Seq[RDDPartitionInfo], + pageSize: Int, + sortColumn: String, + desc: Boolean) extends PagedTable[BlockTableRowData] { + + override def tableId: String = "rdd-storage-by-block-table" + + override def tableCssClass: String = "table table-bordered table-condensed table-striped" + + override val dataSource: BlockDataSource = new BlockDataSource( + rddPartitions, + pageSize, + sortColumn, + desc) + + override def pageLink(page: Int): String = { + val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8") + s"${basePath}&block.page=$page&block.sort=${encodedSortColumn}&block.desc=${desc}" + + s"&block.pageSize=${pageSize}" + } + + override def goButtonJavascriptFunction: (String, String) = { + val jsFuncName = 
"goToBlockPage" + val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8") + val jsFunc = s""" + |currentBlockPageSize = ${pageSize} + |function goToBlockPage(page, pageSize) { + | // Set page to 1 if the page size changes + | page = pageSize == currentBlockPageSize ? page : 1; + | var url = "${basePath}&block.sort=${encodedSortColumn}&block.desc=${desc}" + + | "&block.page=" + page + "&block.pageSize=" + pageSize; + | window.location.href = url; + |} + """.stripMargin + (jsFuncName, jsFunc) + } - /** Render an HTML row representing a block */ - private def blockRow(row: RDDPartitionInfo): Seq[Node] = { + override def headers: Seq[Node] = { + val blockHeaders = Seq( + "Block Name", + "Storage Level", + "Size in Memory", + "Size on Disk", + "Executors") + + if (!blockHeaders.contains(sortColumn)) { + throw new IllegalArgumentException(s"Unknown column: $sortColumn") + } + + val headerRow: Seq[Node] = { + blockHeaders.map { header => + if (header == sortColumn) { + val headerLink = + s"$basePath&block.sort=${URLEncoder.encode(header, "UTF-8")}&block.desc=${!desc}" + + s"&block.pageSize=${pageSize}" + val js = Unparsed(s"window.location.href='${headerLink}'") + val arrow = if (desc) "▾" else "▴" // UP or DOWN + + {header} +  {Unparsed(arrow)} + + } else { + val headerLink = + s"$basePath&block.sort=${URLEncoder.encode(header, "UTF-8")}" + + s"&block.pageSize=${pageSize}" + val js = Unparsed(s"window.location.href='${headerLink}'") + + {header} + + } + } + } + {headerRow} + } + + override def row(block: BlockTableRowData): Seq[Node] = { - {row.blockName} - - {row.storageLevel} - - - {Utils.bytesToString(row.memoryUsed)} - - - {Utils.bytesToString(row.diskUsed)} - - - {row.executors.map(l => {l}
)} - + {block.blockName} + {block.storageLevel} + {Utils.bytesToString(block.memoryUsed)} + {Utils.bytesToString(block.diskUsed)} + {block.executors} } } diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index ebead830c646..150d82b3930e 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -181,7 +181,7 @@ private[spark] object ClosureCleaner extends Logging { return } - logDebug(s"+++ Cleaning closure $func (${func.getClass.getName}}) +++") + logDebug(s"+++ Cleaning closure $func (${func.getClass.getName}) +++") // A list of classes that represents closures enclosed in the given one val innerClasses = getInnerClosureClasses(func) diff --git a/core/src/main/scala/org/apache/spark/util/IdGenerator.scala b/core/src/main/scala/org/apache/spark/util/IdGenerator.scala index 17e55f7996bf..53934ad4ce47 100644 --- a/core/src/main/scala/org/apache/spark/util/IdGenerator.scala +++ b/core/src/main/scala/org/apache/spark/util/IdGenerator.scala @@ -22,10 +22,10 @@ import java.util.concurrent.atomic.AtomicInteger /** * A util used to get a unique generation ID. This is a wrapper around Java's * AtomicInteger. An example usage is in BlockManager, where each BlockManager - * instance would start an Akka actor and we use this utility to assign the Akka - * actors unique names. + * instance would start an RpcEndpoint and we use this utility to assign the RpcEndpoints' + * unique names. */ private[spark] class IdGenerator { - private var id = new AtomicInteger + private val id = new AtomicInteger def next: Int = id.incrementAndGet } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index c600319d9ddb..cbc94fd6d54d 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -790,7 +790,7 @@ private[spark] object JsonProtocol { val fullStackTrace = Utils.jsonOption(json \ "Full Stack Trace"). map(_.extract[String]).orNull val metrics = Utils.jsonOption(json \ "Metrics").map(taskMetricsFromJson) - ExceptionFailure(className, description, stackTrace, fullStackTrace, metrics) + ExceptionFailure(className, description, stackTrace, fullStackTrace, metrics, None) case `taskResultLost` => TaskResultLost case `taskKilled` => TaskKilled case `executorLostFailure` => diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index 171855406198..e7a65d74a440 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -58,7 +58,7 @@ private[spark] class ManualClock(private var time: Long) extends Clock { */ def waitTillTime(targetTime: Long): Long = synchronized { while (time < targetTime) { - wait(100) + wait(10) } getTimeMillis() } diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala new file mode 100644 index 000000000000..61ff9b89ec1c --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.io.File +import java.util.PriorityQueue + +import scala.util.{Failure, Success, Try} +import tachyon.client.TachyonFile + +import org.apache.hadoop.fs.FileSystem +import org.apache.spark.Logging + +/** + * Various utility methods used by Spark. + */ +private[spark] object ShutdownHookManager extends Logging { + val DEFAULT_SHUTDOWN_PRIORITY = 100 + + /** + * The shutdown priority of the SparkContext instance. This is lower than the default + * priority, so that by default hooks are run before the context is shut down. + */ + val SPARK_CONTEXT_SHUTDOWN_PRIORITY = 50 + + /** + * The shutdown priority of temp directory must be lower than the SparkContext shutdown + * priority. Otherwise cleaning the temp directories while Spark jobs are running can + * throw undesirable errors at the time of shutdown. + */ + val TEMP_DIR_SHUTDOWN_PRIORITY = 25 + + private lazy val shutdownHooks = { + val manager = new SparkShutdownHookManager() + manager.install() + manager + } + + private val shutdownDeletePaths = new scala.collection.mutable.HashSet[String]() + private val shutdownDeleteTachyonPaths = new scala.collection.mutable.HashSet[String]() + + // Add a shutdown hook to delete the temp dirs when the JVM exits + addShutdownHook(TEMP_DIR_SHUTDOWN_PRIORITY) { () => + logInfo("Shutdown hook called") + shutdownDeletePaths.foreach { dirPath => + try { + logInfo("Deleting directory " + dirPath) + Utils.deleteRecursively(new File(dirPath)) + } catch { + case e: Exception => logError(s"Exception while deleting Spark temp dir: $dirPath", e) + } + } + } + + // Register the path to be deleted via shutdown hook + def registerShutdownDeleteDir(file: File) { + val absolutePath = file.getAbsolutePath() + shutdownDeletePaths.synchronized { + shutdownDeletePaths += absolutePath + } + } + + // Register the tachyon path to be deleted via shutdown hook + def registerShutdownDeleteDir(tachyonfile: TachyonFile) { + val absolutePath = tachyonfile.getPath() + shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths += absolutePath + } + } + + // Remove the path to be deleted via shutdown hook + def removeShutdownDeleteDir(file: File) { + val absolutePath = file.getAbsolutePath() + shutdownDeletePaths.synchronized { + shutdownDeletePaths.remove(absolutePath) + } + } + + // Remove the tachyon path to be deleted via shutdown hook + def removeShutdownDeleteDir(tachyonfile: TachyonFile) { + val absolutePath = tachyonfile.getPath() + shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths.remove(absolutePath) + } + } + + // Is the path already registered to be deleted via a shutdown hook ? 
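The three shutdown priorities defined above (100 for ordinary hooks, 50 for the SparkContext hook, 25 for temp-dir cleanup) take effect through SparkShutdownHook.compareTo near the end of this file: java.util.PriorityQueue polls its smallest element, and compareTo is written as other.priority - priority, so the hook with the numerically highest priority is polled, and therefore run, first. That matches the intent stated for the constants (default-priority hooks run before the SparkContext is stopped, and temp directories are deleted last), even though the addShutdownHook scaladoc below says "lower priority values run first". A minimal, self-contained model of that ordering; the Hook class and hook names are illustrative and not part of this patch:

    import java.util.PriorityQueue

    final class Hook(val priority: Int, val name: String) extends Comparable[Hook] {
      // Same comparison as SparkShutdownHook.compareTo: a numerically higher
      // priority sorts ahead of a lower one, so it is polled (run) first.
      override def compareTo(other: Hook): Int = other.priority - priority
    }

    object ShutdownOrderSketch {
      def main(args: Array[String]): Unit = {
        val hooks = new PriorityQueue[Hook]()
        hooks.add(new Hook(25, "delete temp dirs"))    // TEMP_DIR_SHUTDOWN_PRIORITY
        hooks.add(new Hook(100, "application hook"))   // DEFAULT_SHUTDOWN_PRIORITY
        hooks.add(new Hook(50, "stop SparkContext"))   // SPARK_CONTEXT_SHUTDOWN_PRIORITY
        while (!hooks.isEmpty) {
          println(hooks.poll().name)
        }
        // Prints: application hook, stop SparkContext, delete temp dirs.
      }
    }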
+ def hasShutdownDeleteDir(file: File): Boolean = { + val absolutePath = file.getAbsolutePath() + shutdownDeletePaths.synchronized { + shutdownDeletePaths.contains(absolutePath) + } + } + + // Is the path already registered to be deleted via a shutdown hook ? + def hasShutdownDeleteTachyonDir(file: TachyonFile): Boolean = { + val absolutePath = file.getPath() + shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths.contains(absolutePath) + } + } + + // Note: if file is child of some registered path, while not equal to it, then return true; + // else false. This is to ensure that two shutdown hooks do not try to delete each others + // paths - resulting in IOException and incomplete cleanup. + def hasRootAsShutdownDeleteDir(file: File): Boolean = { + val absolutePath = file.getAbsolutePath() + val retval = shutdownDeletePaths.synchronized { + shutdownDeletePaths.exists { path => + !absolutePath.equals(path) && absolutePath.startsWith(path) + } + } + if (retval) { + logInfo("path = " + file + ", already present as root for deletion.") + } + retval + } + + // Note: if file is child of some registered path, while not equal to it, then return true; + // else false. This is to ensure that two shutdown hooks do not try to delete each others + // paths - resulting in Exception and incomplete cleanup. + def hasRootAsShutdownDeleteDir(file: TachyonFile): Boolean = { + val absolutePath = file.getPath() + val retval = shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths.exists { path => + !absolutePath.equals(path) && absolutePath.startsWith(path) + } + } + if (retval) { + logInfo("path = " + file + ", already present as root for deletion.") + } + retval + } + + /** + * Detect whether this thread might be executing a shutdown hook. Will always return true if + * the current thread is a running a shutdown hook but may spuriously return true otherwise (e.g. + * if System.exit was just called by a concurrent thread). + * + * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing + * an IllegalStateException. + */ + def inShutdown(): Boolean = { + try { + val hook = new Thread { + override def run() {} + } + Runtime.getRuntime.addShutdownHook(hook) + Runtime.getRuntime.removeShutdownHook(hook) + } catch { + case ise: IllegalStateException => return true + } + false + } + + /** + * Adds a shutdown hook with default priority. + * + * @param hook The code to run during shutdown. + * @return A handle that can be used to unregister the shutdown hook. + */ + def addShutdownHook(hook: () => Unit): AnyRef = { + addShutdownHook(DEFAULT_SHUTDOWN_PRIORITY)(hook) + } + + /** + * Adds a shutdown hook with the given priority. Hooks with lower priority values run + * first. + * + * @param hook The code to run during shutdown. + * @return A handle that can be used to unregister the shutdown hook. + */ + def addShutdownHook(priority: Int)(hook: () => Unit): AnyRef = { + shutdownHooks.add(priority, hook) + } + + /** + * Remove a previously installed shutdown hook. + * + * @param ref A handle returned by `addShutdownHook`. + * @return Whether the hook was removed. + */ + def removeShutdownHook(ref: AnyRef): Boolean = { + shutdownHooks.remove(ref) + } + +} + +private [util] class SparkShutdownHookManager { + + private val hooks = new PriorityQueue[SparkShutdownHook]() + private var shuttingDown = false + + /** + * Install a hook to run at shutdown and run all registered hooks in order. 
Hadoop 1.x does not + * have `ShutdownHookManager`, so in that case we just use the JVM's `Runtime` object and hope for + * the best. + */ + def install(): Unit = { + val hookTask = new Runnable() { + override def run(): Unit = runAll() + } + Try(Utils.classForName("org.apache.hadoop.util.ShutdownHookManager")) match { + case Success(shmClass) => + val fsPriority = classOf[FileSystem].getField("SHUTDOWN_HOOK_PRIORITY").get() + .asInstanceOf[Int] + val shm = shmClass.getMethod("get").invoke(null) + shm.getClass().getMethod("addShutdownHook", classOf[Runnable], classOf[Int]) + .invoke(shm, hookTask, Integer.valueOf(fsPriority + 30)) + + case Failure(_) => + Runtime.getRuntime.addShutdownHook(new Thread(hookTask, "Spark Shutdown Hook")); + } + } + + def runAll(): Unit = synchronized { + shuttingDown = true + while (!hooks.isEmpty()) { + Try(Utils.logUncaughtExceptions(hooks.poll().run())) + } + } + + def add(priority: Int, hook: () => Unit): AnyRef = synchronized { + checkState() + val hookRef = new SparkShutdownHook(priority, hook) + hooks.add(hookRef) + hookRef + } + + def remove(ref: AnyRef): Boolean = synchronized { + hooks.remove(ref) + } + + private def checkState(): Unit = { + if (shuttingDown) { + throw new IllegalStateException("Shutdown hooks cannot be modified during shutdown.") + } + } + +} + +private class SparkShutdownHook(private val priority: Int, hook: () => Unit) + extends Comparable[SparkShutdownHook] { + + override def compareTo(other: SparkShutdownHook): Int = { + other.priority - priority + } + + def run(): Unit = hook() + +} diff --git a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala index ad3db1fbb57e..724818724733 100644 --- a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala +++ b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala @@ -33,7 +33,7 @@ private[spark] object SparkUncaughtExceptionHandler // We may have been called from a shutdown hook. If so, we must not call System.exit(). // (If we do, we will deadlock.) - if (!Utils.inShutdown()) { + if (!ShutdownHookManager.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(SparkExitCode.OOM) } else { diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index c4012d0e83f7..831331222671 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -21,7 +21,7 @@ import java.io._ import java.lang.management.ManagementFactory import java.net._ import java.nio.ByteBuffer -import java.util.{PriorityQueue, Properties, Locale, Random, UUID} +import java.util.{Properties, Locale, Random, UUID} import java.util.concurrent._ import javax.net.ssl.HttpsURLConnection @@ -65,21 +65,6 @@ private[spark] object CallSite { private[spark] object Utils extends Logging { val random = new Random() - val DEFAULT_SHUTDOWN_PRIORITY = 100 - - /** - * The shutdown priority of the SparkContext instance. This is lower than the default - * priority, so that by default hooks are run before the context is shut down. - */ - val SPARK_CONTEXT_SHUTDOWN_PRIORITY = 50 - - /** - * The shutdown priority of temp directory must be lower than the SparkContext shutdown - * priority. Otherwise cleaning the temp directories while Spark jobs are running can - * throw undesirable errors at the time of shutdown. 
- */ - val TEMP_DIR_SHUTDOWN_PRIORITY = 25 - /** * Define a default value for driver memory here since this value is referenced across the code * base and nearly all files already use Utils.scala @@ -90,9 +75,6 @@ private[spark] object Utils extends Logging { @volatile private var localRootDirs: Array[String] = null - private val shutdownHooks = new SparkShutdownHookManager() - shutdownHooks.install() - /** Serialize an object using Java serialization */ def serialize[T](o: T): Array[Byte] = { val bos = new ByteArrayOutputStream() @@ -205,86 +187,6 @@ private[spark] object Utils extends Logging { } } - private val shutdownDeletePaths = new scala.collection.mutable.HashSet[String]() - private val shutdownDeleteTachyonPaths = new scala.collection.mutable.HashSet[String]() - - // Add a shutdown hook to delete the temp dirs when the JVM exits - addShutdownHook(TEMP_DIR_SHUTDOWN_PRIORITY) { () => - logInfo("Shutdown hook called") - shutdownDeletePaths.foreach { dirPath => - try { - logInfo("Deleting directory " + dirPath) - Utils.deleteRecursively(new File(dirPath)) - } catch { - case e: Exception => logError(s"Exception while deleting Spark temp dir: $dirPath", e) - } - } - } - - // Register the path to be deleted via shutdown hook - def registerShutdownDeleteDir(file: File) { - val absolutePath = file.getAbsolutePath() - shutdownDeletePaths.synchronized { - shutdownDeletePaths += absolutePath - } - } - - // Register the tachyon path to be deleted via shutdown hook - def registerShutdownDeleteDir(tachyonfile: TachyonFile) { - val absolutePath = tachyonfile.getPath() - shutdownDeleteTachyonPaths.synchronized { - shutdownDeleteTachyonPaths += absolutePath - } - } - - // Is the path already registered to be deleted via a shutdown hook ? - def hasShutdownDeleteDir(file: File): Boolean = { - val absolutePath = file.getAbsolutePath() - shutdownDeletePaths.synchronized { - shutdownDeletePaths.contains(absolutePath) - } - } - - // Is the path already registered to be deleted via a shutdown hook ? - def hasShutdownDeleteTachyonDir(file: TachyonFile): Boolean = { - val absolutePath = file.getPath() - shutdownDeleteTachyonPaths.synchronized { - shutdownDeleteTachyonPaths.contains(absolutePath) - } - } - - // Note: if file is child of some registered path, while not equal to it, then return true; - // else false. This is to ensure that two shutdown hooks do not try to delete each others - // paths - resulting in IOException and incomplete cleanup. - def hasRootAsShutdownDeleteDir(file: File): Boolean = { - val absolutePath = file.getAbsolutePath() - val retval = shutdownDeletePaths.synchronized { - shutdownDeletePaths.exists { path => - !absolutePath.equals(path) && absolutePath.startsWith(path) - } - } - if (retval) { - logInfo("path = " + file + ", already present as root for deletion.") - } - retval - } - - // Note: if file is child of some registered path, while not equal to it, then return true; - // else false. This is to ensure that two shutdown hooks do not try to delete each others - // paths - resulting in Exception and incomplete cleanup. - def hasRootAsShutdownDeleteDir(file: TachyonFile): Boolean = { - val absolutePath = file.getPath() - val retval = shutdownDeleteTachyonPaths.synchronized { - shutdownDeleteTachyonPaths.exists { path => - !absolutePath.equals(path) && absolutePath.startsWith(path) - } - } - if (retval) { - logInfo("path = " + file + ", already present as root for deletion.") - } - retval - } - /** * JDK equivalent of `chmod 700 file`. 
* @@ -333,7 +235,7 @@ private[spark] object Utils extends Logging { root: String = System.getProperty("java.io.tmpdir"), namePrefix: String = "spark"): File = { val dir = createDirectory(root, namePrefix) - registerShutdownDeleteDir(dir) + ShutdownHookManager.registerShutdownDeleteDir(dir) dir } @@ -973,9 +875,7 @@ private[spark] object Utils extends Logging { if (savedIOException != null) { throw savedIOException } - shutdownDeletePaths.synchronized { - shutdownDeletePaths.remove(file.getAbsolutePath) - } + ShutdownHookManager.removeShutdownDeleteDir(file) } } finally { if (!file.delete()) { @@ -1466,7 +1366,7 @@ private[spark] object Utils extends Logging { file.getAbsolutePath, effectiveStartIndex, effectiveEndIndex)) } sum += fileToLength(file) - logDebug(s"After processing file $file, string built is ${stringBuffer.toString}}") + logDebug(s"After processing file $file, string built is ${stringBuffer.toString}") } stringBuffer.toString } @@ -1478,27 +1378,6 @@ private[spark] object Utils extends Logging { serializer.deserialize[T](serializer.serialize(value)) } - /** - * Detect whether this thread might be executing a shutdown hook. Will always return true if - * the current thread is a running a shutdown hook but may spuriously return true otherwise (e.g. - * if System.exit was just called by a concurrent thread). - * - * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing - * an IllegalStateException. - */ - def inShutdown(): Boolean = { - try { - val hook = new Thread { - override def run() {} - } - Runtime.getRuntime.addShutdownHook(hook) - Runtime.getRuntime.removeShutdownHook(hook) - } catch { - case ise: IllegalStateException => return true - } - false - } - private def isSpace(c: Char): Boolean = { " \t\r\n".indexOf(c) != -1 } @@ -2221,37 +2100,6 @@ private[spark] object Utils extends Logging { msg.startsWith(BACKUP_STANDALONE_MASTER_PREFIX) } - /** - * Adds a shutdown hook with default priority. - * - * @param hook The code to run during shutdown. - * @return A handle that can be used to unregister the shutdown hook. - */ - def addShutdownHook(hook: () => Unit): AnyRef = { - addShutdownHook(DEFAULT_SHUTDOWN_PRIORITY)(hook) - } - - /** - * Adds a shutdown hook with the given priority. Hooks with lower priority values run - * first. - * - * @param hook The code to run during shutdown. - * @return A handle that can be used to unregister the shutdown hook. - */ - def addShutdownHook(priority: Int)(hook: () => Unit): AnyRef = { - shutdownHooks.add(priority, hook) - } - - /** - * Remove a previously installed shutdown hook. - * - * @param ref A handle returned by `addShutdownHook`. - * @return Whether the hook was removed. - */ - def removeShutdownHook(ref: AnyRef): Boolean = { - shutdownHooks.remove(ref) - } - /** * To avoid calling `Utils.getCallSite` for every single RDD we create in the body, * set a dummy call site that RDDs use instead. This is for performance optimization. @@ -2286,70 +2134,17 @@ private[spark] object Utils extends Logging { isInDirectory(parent, child.getParentFile) } -} - -private [util] class SparkShutdownHookManager { - - private val hooks = new PriorityQueue[SparkShutdownHook]() - private var shuttingDown = false - /** - * Install a hook to run at shutdown and run all registered hooks in order. Hadoop 1.x does not - * have `ShutdownHookManager`, so in that case we just use the JVM's `Runtime` object and hope for - * the best. 
+ * Return whether dynamic allocation is enabled in the given conf + * Dynamic allocation and explicitly setting the number of executors are inherently + * incompatible. In environments where dynamic allocation is turned on by default, + * the latter should override the former (SPARK-9092). */ - def install(): Unit = { - val hookTask = new Runnable() { - override def run(): Unit = runAll() - } - Try(Utils.classForName("org.apache.hadoop.util.ShutdownHookManager")) match { - case Success(shmClass) => - val fsPriority = classOf[FileSystem].getField("SHUTDOWN_HOOK_PRIORITY").get() - .asInstanceOf[Int] - val shm = shmClass.getMethod("get").invoke(null) - shm.getClass().getMethod("addShutdownHook", classOf[Runnable], classOf[Int]) - .invoke(shm, hookTask, Integer.valueOf(fsPriority + 30)) - - case Failure(_) => - Runtime.getRuntime.addShutdownHook(new Thread(hookTask, "Spark Shutdown Hook")); - } + def isDynamicAllocationEnabled(conf: SparkConf): Boolean = { + conf.getBoolean("spark.dynamicAllocation.enabled", false) && + conf.getInt("spark.executor.instances", 0) == 0 } - def runAll(): Unit = synchronized { - shuttingDown = true - while (!hooks.isEmpty()) { - Try(Utils.logUncaughtExceptions(hooks.poll().run())) - } - } - - def add(priority: Int, hook: () => Unit): AnyRef = synchronized { - checkState() - val hookRef = new SparkShutdownHook(priority, hook) - hooks.add(hookRef) - hookRef - } - - def remove(ref: AnyRef): Boolean = synchronized { - hooks.remove(ref) - } - - private def checkState(): Unit = { - if (shuttingDown) { - throw new IllegalStateException("Shutdown hooks cannot be modified during shutdown.") - } - } - -} - -private class SparkShutdownHook(private val priority: Int, hook: () => Unit) - extends Comparable[SparkShutdownHook] { - - override def compareTo(other: SparkShutdownHook): Int = { - other.priority - priority - } - - def run(): Unit = hook() - } /** diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index d166037351c3..f929b12606f0 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -89,6 +89,7 @@ class ExternalAppendOnlyMap[K, V, C]( // Number of bytes spilled in total private var _diskBytesSpilled = 0L + def diskBytesSpilled: Long = _diskBytesSpilled // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided private val fileBufferSize = @@ -97,6 +98,10 @@ class ExternalAppendOnlyMap[K, V, C]( // Write metrics for current spill private var curWriteMetrics: ShuffleWriteMetrics = _ + // Peak size of the in-memory map observed so far, in bytes + private var _peakMemoryUsedBytes: Long = 0L + def peakMemoryUsedBytes: Long = _peakMemoryUsedBytes + private val keyComparator = new HashComparator[K] private val ser = serializer.newInstance() @@ -126,7 +131,11 @@ class ExternalAppendOnlyMap[K, V, C]( while (entries.hasNext) { curEntry = entries.next() - if (maybeSpill(currentMap, currentMap.estimateSize())) { + val estimatedSize = currentMap.estimateSize() + if (estimatedSize > _peakMemoryUsedBytes) { + _peakMemoryUsedBytes = estimatedSize + } + if (maybeSpill(currentMap, estimatedSize)) { currentMap = new SizeTrackingAppendOnlyMap[K, C] } currentMap.changeValue(curEntry._1, update) @@ -207,8 +216,6 @@ class ExternalAppendOnlyMap[K, V, C]( spilledMaps.append(new DiskMapIterator(file, blockId, 
batchSizes)) } - def diskBytesSpilled: Long = _diskBytesSpilled - /** * Return an iterator that merges the in-memory map with the spilled maps. * If no spill has occurred, simply return the in-memory map's iterator. diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index ba7ec834d622..138c05dff19e 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -152,6 +152,9 @@ private[spark] class ExternalSorter[K, V, C]( private var _diskBytesSpilled = 0L def diskBytesSpilled: Long = _diskBytesSpilled + // Peak size of the in-memory data structure observed so far, in bytes + private var _peakMemoryUsedBytes: Long = 0L + def peakMemoryUsedBytes: Long = _peakMemoryUsedBytes // A comparator for keys K that orders them within a partition to allow aggregation or sorting. // Can be a partial ordering by hash code if a total ordering is not provided through by the @@ -185,6 +188,12 @@ private[spark] class ExternalSorter[K, V, C]( private val spills = new ArrayBuffer[SpilledFile] + /** + * Number of files this sorter has spilled so far. + * Exposed for testing. + */ + private[spark] def numSpills: Int = spills.size + override def insertAll(records: Iterator[Product2[K, V]]): Unit = { // TODO: stop combining if we find that the reduction factor isn't high val shouldCombine = aggregator.isDefined @@ -224,15 +233,22 @@ private[spark] class ExternalSorter[K, V, C]( return } + var estimatedSize = 0L if (usingMap) { - if (maybeSpill(map, map.estimateSize())) { + estimatedSize = map.estimateSize() + if (maybeSpill(map, estimatedSize)) { map = new PartitionedAppendOnlyMap[K, C] } } else { - if (maybeSpill(buffer, buffer.estimateSize())) { + estimatedSize = buffer.estimateSize() + if (maybeSpill(buffer, estimatedSize)) { buffer = newBuffer() } } + + if (estimatedSize > _peakMemoryUsedBytes) { + _peakMemoryUsedBytes = estimatedSize + } } /** @@ -684,8 +700,10 @@ private[spark] class ExternalSorter[K, V, C]( } } - context.taskMetrics.incMemoryBytesSpilled(memoryBytesSpilled) - context.taskMetrics.incDiskBytesSpilled(diskBytesSpilled) + context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled) + context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(peakMemoryUsedBytes) lengths } diff --git a/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala b/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala index 786b97ad7b9e..c156b03cdb7c 100644 --- a/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala +++ b/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala @@ -176,10 +176,15 @@ class BernoulliSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T * A sampler for sampling with replacement, based on values drawn from Poisson distribution. * * @param fraction the sampling fraction (with replacement) + * @param useGapSamplingIfPossible if true, use gap sampling when sampling ratio is low. 
* @tparam T item type */ @DeveloperApi -class PoissonSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T] { +class PoissonSampler[T: ClassTag]( + fraction: Double, + useGapSamplingIfPossible: Boolean) extends RandomSampler[T, T] { + + def this(fraction: Double) = this(fraction, useGapSamplingIfPossible = true) /** Epsilon slop to avoid failure from floating point jitter. */ require( @@ -199,17 +204,18 @@ class PoissonSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T] override def sample(items: Iterator[T]): Iterator[T] = { if (fraction <= 0.0) { Iterator.empty - } else if (fraction <= RandomSampler.defaultMaxGapSamplingFraction) { - new GapSamplingReplacementIterator(items, fraction, rngGap, RandomSampler.rngEpsilon) + } else if (useGapSamplingIfPossible && + fraction <= RandomSampler.defaultMaxGapSamplingFraction) { + new GapSamplingReplacementIterator(items, fraction, rngGap, RandomSampler.rngEpsilon) } else { - items.flatMap { item => { + items.flatMap { item => val count = rng.sample() if (count == 0) Iterator.empty else Iterator.fill(count)(item) - }} + } } } - override def clone: PoissonSampler[T] = new PoissonSampler[T](fraction) + override def clone: PoissonSampler[T] = new PoissonSampler[T](fraction, useGapSamplingIfPossible) } diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index e948ca33471a..ffe4b4baffb2 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -51,7 +51,6 @@ import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.*; -import org.apache.spark.executor.TaskMetrics; import org.apache.spark.input.PortableDataStream; import org.apache.spark.partial.BoundedDouble; import org.apache.spark.partial.PartialResult; @@ -1011,7 +1010,7 @@ public void persist() { @Test public void iterator() { JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2); - TaskContext context = new TaskContextImpl(0, 0, 0L, 0, null, null, false, new TaskMetrics()); + TaskContext context = TaskContext$.MODULE$.empty(); Assert.assertEquals(1, rdd.iterator(rdd.partitions().get(0), context).next().intValue()); } diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/PackedRecordPointerSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/PackedRecordPointerSuite.java index db9e82759090..934b7e03050b 100644 --- a/core/src/test/java/org/apache/spark/shuffle/unsafe/PackedRecordPointerSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/PackedRecordPointerSuite.java @@ -32,8 +32,8 @@ public class PackedRecordPointerSuite { public void heap() { final TaskMemoryManager memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP)); - final MemoryBlock page0 = memoryManager.allocatePage(100); - final MemoryBlock page1 = memoryManager.allocatePage(100); + final MemoryBlock page0 = memoryManager.allocatePage(128); + final MemoryBlock page1 = memoryManager.allocatePage(128); final long addressInPage1 = memoryManager.encodePageNumberAndOffset(page1, page1.getBaseOffset() + 42); PackedRecordPointer packedPointer = new PackedRecordPointer(); @@ -50,8 +50,8 @@ public void heap() { public void offHeap() { final TaskMemoryManager memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.UNSAFE)); - final MemoryBlock page0 = memoryManager.allocatePage(100); - final MemoryBlock page1 = memoryManager.allocatePage(100); + final MemoryBlock 
page0 = memoryManager.allocatePage(128); + final MemoryBlock page1 = memoryManager.allocatePage(128); final long addressInPage1 = memoryManager.encodePageNumberAndOffset(page1, page1.getBaseOffset() + 42); PackedRecordPointer packedPointer = new PackedRecordPointer(); diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleInMemorySorterSuite.java index 8fa72597db24..40fefe2c9d14 100644 --- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleInMemorySorterSuite.java @@ -24,7 +24,7 @@ import org.junit.Test; import org.apache.spark.HashPartitioner; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.ExecutorMemoryManager; import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.MemoryBlock; @@ -34,11 +34,7 @@ public class UnsafeShuffleInMemorySorterSuite { private static String getStringFromDataPage(Object baseObject, long baseOffset, int strLength) { final byte[] strBytes = new byte[strLength]; - PlatformDependent.copyMemory( - baseObject, - baseOffset, - strBytes, - PlatformDependent.BYTE_ARRAY_OFFSET, strLength); + Platform.copyMemory(baseObject, baseOffset, strBytes, Platform.BYTE_ARRAY_OFFSET, strLength); return new String(strBytes); } @@ -74,14 +70,10 @@ public void testBasicSorting() throws Exception { for (String str : dataToSort) { final long recordAddress = memoryManager.encodePageNumberAndOffset(dataPage, position); final byte[] strBytes = str.getBytes("utf-8"); - PlatformDependent.UNSAFE.putInt(baseObject, position, strBytes.length); + Platform.putInt(baseObject, position, strBytes.length); position += 4; - PlatformDependent.copyMemory( - strBytes, - PlatformDependent.BYTE_ARRAY_OFFSET, - baseObject, - position, - strBytes.length); + Platform.copyMemory( + strBytes, Platform.BYTE_ARRAY_OFFSET, baseObject, position, strBytes.length); position += strBytes.length; sorter.insertRecord(recordAddress, hashPartitioner.getPartition(str)); } @@ -98,7 +90,7 @@ public void testBasicSorting() throws Exception { Assert.assertTrue("Partition id " + partitionId + " should be >= prev id " + prevPartitionId, partitionId >= prevPartitionId); final long recordAddress = iter.packedRecordPointer.getRecordPointer(); - final int recordLength = PlatformDependent.UNSAFE.getInt( + final int recordLength = Platform.getInt( memoryManager.getPage(recordAddress), memoryManager.getOffsetInPage(recordAddress)); final String str = getStringFromDataPage( memoryManager.getPage(recordAddress), diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java index 04fc09b323db..a266b0c36e0f 100644 --- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java @@ -115,6 +115,7 @@ public void setUp() throws IOException { taskMetrics = new TaskMetrics(); when(shuffleMemoryManager.tryToAcquire(anyLong())).then(returnsFirstArg()); + when(shuffleMemoryManager.pageSizeBytes()).thenReturn(128L * 1024 * 1024); when(blockManager.diskBlockManager()).thenReturn(diskBlockManager); when(blockManager.getDiskWriter( @@ -190,6 +191,7 @@ public Tuple2 answer( }); 
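The in-memory sorter test above lays every record out as a 4-byte length prefix followed by the payload bytes, written and read through the Platform helpers that replace PlatformDependent throughout this patch. A standalone sketch of that layout, assuming only that the spark-unsafe module (which provides org.apache.spark.unsafe.Platform) is on the classpath; the object and variable names are illustrative:

    import org.apache.spark.unsafe.Platform

    object RecordLayoutSketch {
      def main(args: Array[String]): Unit = {
        val page = new Array[Byte](64)                  // stand-in for a data page
        val payload = "hello".getBytes("utf-8")

        // Write the record: 4-byte length prefix, then the raw bytes.
        var pos: Long = Platform.BYTE_ARRAY_OFFSET
        Platform.putInt(page, pos, payload.length)
        pos += 4
        Platform.copyMemory(payload, Platform.BYTE_ARRAY_OFFSET, page, pos, payload.length)

        // Read it back the way the test's getStringFromDataPage helper does.
        var readPos: Long = Platform.BYTE_ARRAY_OFFSET
        val length = Platform.getInt(page, readPos)
        readPos += 4
        val bytes = new Array[Byte](length)
        Platform.copyMemory(page, readPos, bytes, Platform.BYTE_ARRAY_OFFSET, length)
        println(new String(bytes, "utf-8"))             // prints "hello"
      }
    }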
when(taskContext.taskMetrics()).thenReturn(taskMetrics); + when(taskContext.internalMetricsToAccumulators()).thenReturn(null); when(shuffleDep.serializer()).thenReturn(Option.apply(serializer)); when(shuffleDep.partitioner()).thenReturn(hashPartitioner); @@ -473,62 +475,22 @@ public void writeRecordsThatAreBiggerThanDiskWriteBufferSize() throws Exception @Test public void writeRecordsThatAreBiggerThanMaxRecordSize() throws Exception { - // Use a custom serializer so that we have exact control over the size of serialized data. - final Serializer byteArraySerializer = new Serializer() { - @Override - public SerializerInstance newInstance() { - return new SerializerInstance() { - @Override - public SerializationStream serializeStream(final OutputStream s) { - return new SerializationStream() { - @Override - public void flush() { } - - @Override - public SerializationStream writeObject(T t, ClassTag ev1) { - byte[] bytes = (byte[]) t; - try { - s.write(bytes); - } catch (IOException e) { - throw new RuntimeException(e); - } - return this; - } - - @Override - public void close() { } - }; - } - public ByteBuffer serialize(T t, ClassTag ev1) { return null; } - public DeserializationStream deserializeStream(InputStream s) { return null; } - public T deserialize(ByteBuffer b, ClassLoader l, ClassTag ev1) { return null; } - public T deserialize(ByteBuffer bytes, ClassTag ev1) { return null; } - }; - } - }; - when(shuffleDep.serializer()).thenReturn(Option.apply(byteArraySerializer)); final UnsafeShuffleWriter writer = createWriter(false); - // Insert a record and force a spill so that there's something to clean up: - writer.insertRecordIntoSorter(new Tuple2(new byte[1], new byte[1])); - writer.forceSorterToSpill(); + final ArrayList> dataToWrite = new ArrayList>(); + dataToWrite.add(new Tuple2(1, ByteBuffer.wrap(new byte[1]))); // We should be able to write a record that's right _at_ the max record size final byte[] atMaxRecordSize = new byte[writer.maxRecordSizeBytes()]; new Random(42).nextBytes(atMaxRecordSize); - writer.insertRecordIntoSorter(new Tuple2(new byte[0], atMaxRecordSize)); - writer.forceSorterToSpill(); - // Inserting a record that's larger than the max record size should fail: + dataToWrite.add(new Tuple2(2, ByteBuffer.wrap(atMaxRecordSize))); + // Inserting a record that's larger than the max record size final byte[] exceedsMaxRecordSize = new byte[writer.maxRecordSizeBytes() + 1]; new Random(42).nextBytes(exceedsMaxRecordSize); - Product2 hugeRecord = - new Tuple2(new byte[0], exceedsMaxRecordSize); - try { - // Here, we write through the public `write()` interface instead of the test-only - // `insertRecordIntoSorter` interface: - writer.write(Collections.singletonList(hugeRecord).iterator()); - fail("Expected exception to be thrown"); - } catch (IOException e) { - // Pass - } + dataToWrite.add(new Tuple2(3, ByteBuffer.wrap(exceedsMaxRecordSize))); + writer.write(dataToWrite.iterator()); + writer.stop(true); + assertEquals( + HashMultiset.create(dataToWrite), + HashMultiset.create(readRecordsFromFile())); assertSpillFilesWereCleanedUp(); } @@ -542,4 +504,58 @@ public void spillFilesAreDeletedWhenStoppingAfterError() throws IOException { writer.stop(false); assertSpillFilesWereCleanedUp(); } + + @Test + public void testPeakMemoryUsed() throws Exception { + final long recordLengthBytes = 8; + final long pageSizeBytes = 256; + final long numRecordsPerPage = pageSizeBytes / recordLengthBytes; + when(shuffleMemoryManager.pageSizeBytes()).thenReturn(pageSizeBytes); + final 
UnsafeShuffleWriter writer = + new UnsafeShuffleWriter( + blockManager, + shuffleBlockResolver, + taskMemoryManager, + shuffleMemoryManager, + new UnsafeShuffleHandle<>(0, 1, shuffleDep), + 0, // map id + taskContext, + conf); + + // Peak memory should be monotonically increasing. More specifically, every time + // we allocate a new page it should increase by exactly the size of the page. + long previousPeakMemory = writer.getPeakMemoryUsedBytes(); + long newPeakMemory; + try { + for (int i = 0; i < numRecordsPerPage * 10; i++) { + writer.insertRecordIntoSorter(new Tuple2(1, 1)); + newPeakMemory = writer.getPeakMemoryUsedBytes(); + if (i % numRecordsPerPage == 0 && i != 0) { + // The first page is allocated in constructor, another page will be allocated after + // every numRecordsPerPage records (peak memory should change). + assertEquals(previousPeakMemory + pageSizeBytes, newPeakMemory); + } else { + assertEquals(previousPeakMemory, newPeakMemory); + } + previousPeakMemory = newPeakMemory; + } + + // Spilling should not change peak memory + writer.forceSorterToSpill(); + newPeakMemory = writer.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + for (int i = 0; i < numRecordsPerPage; i++) { + writer.insertRecordIntoSorter(new Tuple2(1, 1)); + } + newPeakMemory = writer.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + + // Closing the writer should not change peak memory + writer.closeAndWriteOutput(); + newPeakMemory = writer.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + } finally { + writer.stop(false); + } + } } diff --git a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java index dbb7c662d787..ab480b60adae 100644 --- a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java +++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java @@ -25,15 +25,14 @@ import org.mockito.invocation.InvocationOnMock; import org.mockito.stubbing.Answer; import static org.hamcrest.Matchers.greaterThan; +import static org.junit.Assert.*; import static org.mockito.AdditionalMatchers.geq; import static org.mockito.Mockito.*; import org.apache.spark.shuffle.ShuffleMemoryManager; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.memory.*; -import org.apache.spark.unsafe.PlatformDependent; -import static org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET; -import static org.apache.spark.unsafe.PlatformDependent.LONG_ARRAY_OFFSET; +import org.apache.spark.unsafe.Platform; public abstract class AbstractBytesToBytesMapSuite { @@ -47,7 +46,7 @@ public abstract class AbstractBytesToBytesMapSuite { @Before public void setup() { - shuffleMemoryManager = new ShuffleMemoryManager(Long.MAX_VALUE); + shuffleMemoryManager = ShuffleMemoryManager.create(Long.MAX_VALUE, PAGE_SIZE_BYTES); taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(getMemoryAllocator())); // Mocked memory manager for tests that check the maximum array size, since actually allocating // such large arrays will cause us to run out of memory in our tests. 
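The peak-memory tests added in this patch all share the same page arithmetic. In the writer test above, records are 8 bytes and pages are 256 bytes, so 256 / 8 = 32 records fit per page; the peak is expected to jump by exactly one page size on inserts 32, 64, 96, and so on, to stay flat otherwise, and to remain unchanged by spilling or by closing the writer. A small sketch of that bookkeeping, mirroring the "track the largest size seen so far" update this patch adds to ExternalSorter and ExternalAppendOnlyMap; all names below are illustrative:

    object PeakTrackingSketch {
      def main(args: Array[String]): Unit = {
        val recordLengthBytes = 8L
        val pageSizeBytes = 256L
        val numRecordsPerPage = pageSizeBytes / recordLengthBytes   // 32 records per page

        var allocatedBytes = 0L
        var peakBytes = 0L
        for (i <- 0L until numRecordsPerPage * 10) {
          if (i % numRecordsPerPage == 0) {
            allocatedBytes += pageSizeBytes            // a new page is needed for this record
          }
          // Same idea as the `if (estimatedSize > _peakMemoryUsedBytes)` update
          // added to ExternalSorter and ExternalAppendOnlyMap in this patch.
          if (allocatedBytes > peakBytes) {
            peakBytes = allocatedBytes
          }
        }
        allocatedBytes = 0L                            // releasing pages (e.g. a spill)...
        println(peakBytes)                             // ...does not lower the peak: 2560
      }
    }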
@@ -79,13 +78,8 @@ public void tearDown() { private static byte[] getByteArray(MemoryLocation loc, int size) { final byte[] arr = new byte[size]; - PlatformDependent.copyMemory( - loc.getBaseObject(), - loc.getBaseOffset(), - arr, - BYTE_ARRAY_OFFSET, - size - ); + Platform.copyMemory( + loc.getBaseObject(), loc.getBaseOffset(), arr, Platform.BYTE_ARRAY_OFFSET, size); return arr; } @@ -107,7 +101,7 @@ private static boolean arrayEquals( long actualLengthBytes) { return (actualLengthBytes == expected.length) && ByteArrayMethods.arrayEquals( expected, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, actualAddr.getBaseObject(), actualAddr.getBaseOffset(), expected.length @@ -123,7 +117,7 @@ public void emptyMap() { final int keyLengthInWords = 10; final int keyLengthInBytes = keyLengthInWords * 8; final byte[] key = getRandomByteArray(keyLengthInWords); - Assert.assertFalse(map.lookup(key, BYTE_ARRAY_OFFSET, keyLengthInBytes).isDefined()); + Assert.assertFalse(map.lookup(key, Platform.BYTE_ARRAY_OFFSET, keyLengthInBytes).isDefined()); Assert.assertFalse(map.iterator().hasNext()); } finally { map.free(); @@ -140,14 +134,14 @@ public void setAndRetrieveAKey() { final byte[] valueData = getRandomByteArray(recordLengthWords); try { final BytesToBytesMap.Location loc = - map.lookup(keyData, BYTE_ARRAY_OFFSET, recordLengthBytes); + map.lookup(keyData, Platform.BYTE_ARRAY_OFFSET, recordLengthBytes); Assert.assertFalse(loc.isDefined()); Assert.assertTrue(loc.putNewKey( keyData, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, recordLengthBytes, valueData, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, recordLengthBytes )); // After storing the key and value, the other location methods should return results that @@ -158,7 +152,8 @@ public void setAndRetrieveAKey() { Assert.assertArrayEquals(valueData, getByteArray(loc.getValueAddress(), recordLengthBytes)); // After calling lookup() the location should still point to the correct data. 
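The map tests above drive BytesToBytesMap exclusively through lookup() and putNewKey(): lookup() always returns a Location, isDefined() reports whether the key is already stored, and putNewKey() may be called at most once per key. A condensed sketch of that calling pattern, built only from the constructors and helpers visible in this diff; it assumes the Spark 1.5 core and unsafe modules are on the classpath and, because ShuffleMemoryManager is package-private to Spark, that the file lives in an org.apache.spark subpackage; the page size and object names are illustrative:

    package org.apache.spark.example   // needed to reach the private[spark] ShuffleMemoryManager

    import org.apache.spark.shuffle.ShuffleMemoryManager
    import org.apache.spark.unsafe.Platform
    import org.apache.spark.unsafe.map.BytesToBytesMap
    import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}

    object BytesToBytesMapSketch {
      def main(args: Array[String]): Unit = {
        val pageSize = 1L << 20   // illustrative page size, not the suite's constant
        val taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
        val shuffleMemoryManager = ShuffleMemoryManager.create(Long.MaxValue, pageSize)
        val map = new BytesToBytesMap(taskMemoryManager, shuffleMemoryManager, 64, pageSize)
        try {
          val key = Array[Long](42L)
          val value = Array[Long](4242L)
          val loc = map.lookup(key, Platform.LONG_ARRAY_OFFSET, 8)
          if (!loc.isDefined()) {
            // putNewKey copies both buffers into the map's current data page.
            loc.putNewKey(key, Platform.LONG_ARRAY_OFFSET, 8, value, Platform.LONG_ARRAY_OFFSET, 8)
          }
          // A later lookup returns a defined Location pointing at the stored bytes.
          val found = map.lookup(key, Platform.LONG_ARRAY_OFFSET, 8)
          val addr = found.getValueAddress()
          println(Platform.getLong(addr.getBaseObject(), addr.getBaseOffset()))   // 4242
        } finally {
          map.free()
        }
      }
    }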
- Assert.assertTrue(map.lookup(keyData, BYTE_ARRAY_OFFSET, recordLengthBytes).isDefined()); + Assert.assertTrue( + map.lookup(keyData, Platform.BYTE_ARRAY_OFFSET, recordLengthBytes).isDefined()); Assert.assertEquals(recordLengthBytes, loc.getKeyLength()); Assert.assertEquals(recordLengthBytes, loc.getValueLength()); Assert.assertArrayEquals(keyData, getByteArray(loc.getKeyAddress(), recordLengthBytes)); @@ -167,10 +162,10 @@ public void setAndRetrieveAKey() { try { Assert.assertTrue(loc.putNewKey( keyData, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, recordLengthBytes, valueData, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, recordLengthBytes )); Assert.fail("Should not be able to set a new value for a key"); @@ -182,8 +177,7 @@ public void setAndRetrieveAKey() { } } - @Test - public void iteratorTest() throws Exception { + private void iteratorTestBase(boolean destructive) throws Exception { final int size = 4096; BytesToBytesMap map = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, size / 2, PAGE_SIZE_BYTES); @@ -191,47 +185,64 @@ public void iteratorTest() throws Exception { for (long i = 0; i < size; i++) { final long[] value = new long[] { i }; final BytesToBytesMap.Location loc = - map.lookup(value, PlatformDependent.LONG_ARRAY_OFFSET, 8); + map.lookup(value, Platform.LONG_ARRAY_OFFSET, 8); Assert.assertFalse(loc.isDefined()); // Ensure that we store some zero-length keys if (i % 5 == 0) { Assert.assertTrue(loc.putNewKey( null, - PlatformDependent.LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, 0, value, - PlatformDependent.LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, 8 )); } else { Assert.assertTrue(loc.putNewKey( value, - PlatformDependent.LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, 8, value, - PlatformDependent.LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, 8 )); } } final java.util.BitSet valuesSeen = new java.util.BitSet(size); - final Iterator iter = map.iterator(); + final Iterator iter; + if (destructive) { + iter = map.destructiveIterator(); + } else { + iter = map.iterator(); + } + int numPages = map.getNumDataPages(); + int countFreedPages = 0; while (iter.hasNext()) { final BytesToBytesMap.Location loc = iter.next(); Assert.assertTrue(loc.isDefined()); final MemoryLocation keyAddress = loc.getKeyAddress(); final MemoryLocation valueAddress = loc.getValueAddress(); - final long value = PlatformDependent.UNSAFE.getLong( + final long value = Platform.getLong( valueAddress.getBaseObject(), valueAddress.getBaseOffset()); final long keyLength = loc.getKeyLength(); if (keyLength == 0) { Assert.assertTrue("value " + value + " was not divisible by 5", value % 5 == 0); } else { - final long key = PlatformDependent.UNSAFE.getLong( - keyAddress.getBaseObject(), keyAddress.getBaseOffset()); + final long key = Platform.getLong(keyAddress.getBaseObject(), keyAddress.getBaseOffset()); Assert.assertEquals(value, key); } valuesSeen.set((int) value); + if (destructive) { + // The iterator moves onto next page and frees previous page + if (map.getNumDataPages() < numPages) { + numPages = map.getNumDataPages(); + countFreedPages++; + } + } + } + if (destructive) { + // Latest page is not freed by iterator but by map itself + Assert.assertEquals(countFreedPages, numPages - 1); } Assert.assertEquals(size, valuesSeen.cardinality()); } finally { @@ -239,6 +250,16 @@ public void iteratorTest() throws Exception { } } + @Test + public void iteratorTest() throws Exception { + iteratorTestBase(false); + } + + @Test + public void destructiveIteratorTest() throws 
Exception { + iteratorTestBase(true); + } + @Test public void iteratingOverDataPagesWithWastedSpace() throws Exception { final int NUM_ENTRIES = 1000 * 1000; @@ -256,16 +277,16 @@ public void iteratingOverDataPagesWithWastedSpace() throws Exception { final long[] value = new long[] { i, i, i, i, i }; // 5 * 8 = 40 bytes final BytesToBytesMap.Location loc = map.lookup( key, - LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, KEY_LENGTH ); Assert.assertFalse(loc.isDefined()); Assert.assertTrue(loc.putNewKey( key, - LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, KEY_LENGTH, value, - LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, VALUE_LENGTH )); } @@ -280,18 +301,18 @@ public void iteratingOverDataPagesWithWastedSpace() throws Exception { Assert.assertTrue(loc.isDefined()); Assert.assertEquals(KEY_LENGTH, loc.getKeyLength()); Assert.assertEquals(VALUE_LENGTH, loc.getValueLength()); - PlatformDependent.copyMemory( + Platform.copyMemory( loc.getKeyAddress().getBaseObject(), loc.getKeyAddress().getBaseOffset(), key, - LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, KEY_LENGTH ); - PlatformDependent.copyMemory( + Platform.copyMemory( loc.getValueAddress().getBaseObject(), loc.getValueAddress().getBaseOffset(), value, - LONG_ARRAY_OFFSET, + Platform.LONG_ARRAY_OFFSET, VALUE_LENGTH ); for (long j : key) { @@ -326,16 +347,16 @@ public void randomizedStressTest() { expected.put(ByteBuffer.wrap(key), value); final BytesToBytesMap.Location loc = map.lookup( key, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, key.length ); Assert.assertFalse(loc.isDefined()); Assert.assertTrue(loc.putNewKey( key, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, key.length, value, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, value.length )); // After calling putNewKey, the following should be true, even before calling @@ -351,7 +372,8 @@ public void randomizedStressTest() { for (Map.Entry entry : expected.entrySet()) { final byte[] key = entry.getKey().array(); final byte[] value = entry.getValue(); - final BytesToBytesMap.Location loc = map.lookup(key, BYTE_ARRAY_OFFSET, key.length); + final BytesToBytesMap.Location loc = + map.lookup(key, Platform.BYTE_ARRAY_OFFSET, key.length); Assert.assertTrue(loc.isDefined()); Assert.assertTrue(arrayEquals(key, loc.getKeyAddress(), loc.getKeyLength())); Assert.assertTrue(arrayEquals(value, loc.getValueAddress(), loc.getValueLength())); @@ -377,16 +399,16 @@ public void randomizedTestWithRecordsLargerThanPageSize() { expected.put(ByteBuffer.wrap(key), value); final BytesToBytesMap.Location loc = map.lookup( key, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, key.length ); Assert.assertFalse(loc.isDefined()); Assert.assertTrue(loc.putNewKey( key, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, key.length, value, - BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, value.length )); // After calling putNewKey, the following should be true, even before calling @@ -401,7 +423,8 @@ public void randomizedTestWithRecordsLargerThanPageSize() { for (Map.Entry entry : expected.entrySet()) { final byte[] key = entry.getKey().array(); final byte[] value = entry.getValue(); - final BytesToBytesMap.Location loc = map.lookup(key, BYTE_ARRAY_OFFSET, key.length); + final BytesToBytesMap.Location loc = + map.lookup(key, Platform.BYTE_ARRAY_OFFSET, key.length); Assert.assertTrue(loc.isDefined()); Assert.assertTrue(arrayEquals(key, loc.getKeyAddress(), loc.getKeyLength())); Assert.assertTrue(arrayEquals(value, loc.getValueAddress(), loc.getValueLength())); @@ -413,18 +436,16 @@ 
public void randomizedTestWithRecordsLargerThanPageSize() { @Test public void failureToAllocateFirstPage() { - shuffleMemoryManager = new ShuffleMemoryManager(1024); + shuffleMemoryManager = ShuffleMemoryManager.createForTesting(1024); BytesToBytesMap map = new BytesToBytesMap(taskMemoryManager, shuffleMemoryManager, 1, PAGE_SIZE_BYTES); try { final long[] emptyArray = new long[0]; final BytesToBytesMap.Location loc = - map.lookup(emptyArray, PlatformDependent.LONG_ARRAY_OFFSET, 0); + map.lookup(emptyArray, Platform.LONG_ARRAY_OFFSET, 0); Assert.assertFalse(loc.isDefined()); Assert.assertFalse(loc.putNewKey( - emptyArray, LONG_ARRAY_OFFSET, 0, - emptyArray, LONG_ARRAY_OFFSET, 0 - )); + emptyArray, Platform.LONG_ARRAY_OFFSET, 0, emptyArray, Platform.LONG_ARRAY_OFFSET, 0)); } finally { map.free(); } @@ -433,15 +454,16 @@ public void failureToAllocateFirstPage() { @Test public void failureToGrow() { - shuffleMemoryManager = new ShuffleMemoryManager(1024 * 10); + shuffleMemoryManager = ShuffleMemoryManager.createForTesting(1024 * 10); BytesToBytesMap map = new BytesToBytesMap(taskMemoryManager, shuffleMemoryManager, 1, 1024); try { boolean success = true; int i; for (i = 0; i < 1024; i++) { final long[] arr = new long[]{i}; - final BytesToBytesMap.Location loc = map.lookup(arr, PlatformDependent.LONG_ARRAY_OFFSET, 8); - success = loc.putNewKey(arr, LONG_ARRAY_OFFSET, 8, arr, LONG_ARRAY_OFFSET, 8); + final BytesToBytesMap.Location loc = map.lookup(arr, Platform.LONG_ARRAY_OFFSET, 8); + success = + loc.putNewKey(arr, Platform.LONG_ARRAY_OFFSET, 8, arr, Platform.LONG_ARRAY_OFFSET, 8); if (!success) { break; } @@ -495,4 +517,57 @@ public void resizingLargeMap() { map.growAndRehash(); map.free(); } + + @Test + public void testPeakMemoryUsed() { + final long recordLengthBytes = 24; + final long pageSizeBytes = 256 + 8; // 8 bytes for end-of-page marker + final long numRecordsPerPage = (pageSizeBytes - 8) / recordLengthBytes; + final BytesToBytesMap map = new BytesToBytesMap( + taskMemoryManager, shuffleMemoryManager, 1024, pageSizeBytes); + + // Since BytesToBytesMap is append-only, we expect the total memory consumption to be + // monotonically increasing. More specifically, every time we allocate a new page it + // should increase by exactly the size of the page. In this regard, the memory usage + // at any given time is also the peak memory used. 
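Concretely, for the test that follows: each entry occupies recordLengthBytes = 24 bytes of data page (an 8-byte key and an 8-byte value, which leaves 8 bytes of per-record length bookkeeping), and pages are 256 + 8 bytes with the extra 8 reserved for the end-of-page marker, so (264 - 8) / 24 = 10 records fit in a page and the reported peak should grow by exactly 264 bytes on every 10th insert. The arithmetic, as a few lines to paste into a Scala REPL (illustrative only):

    val recordLengthBytes = 24L
    val pageSizeBytes = 256L + 8L                                   // 8 bytes for the end-of-page marker
    val numRecordsPerPage = (pageSizeBytes - 8) / recordLengthBytes
    assert(numRecordsPerPage == 10)
    assert((numRecordsPerPage + 1) * recordLengthBytes > pageSizeBytes - 8)   // an 11th record would not fit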
+ long previousPeakMemory = map.getPeakMemoryUsedBytes(); + long newPeakMemory; + try { + for (long i = 0; i < numRecordsPerPage * 10; i++) { + final long[] value = new long[]{i}; + map.lookup(value, Platform.LONG_ARRAY_OFFSET, 8).putNewKey( + value, + Platform.LONG_ARRAY_OFFSET, + 8, + value, + Platform.LONG_ARRAY_OFFSET, + 8); + newPeakMemory = map.getPeakMemoryUsedBytes(); + if (i % numRecordsPerPage == 0 && i > 0) { + // We allocated a new page for this record, so peak memory should change + assertEquals(previousPeakMemory + pageSizeBytes, newPeakMemory); + } else { + assertEquals(previousPeakMemory, newPeakMemory); + } + previousPeakMemory = newPeakMemory; + } + + // Freeing the map should not change the peak memory + map.free(); + newPeakMemory = map.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + + } finally { + map.free(); + } + } + + @Test + public void testAcquirePageInConstructor() { + final BytesToBytesMap map = new BytesToBytesMap( + taskMemoryManager, shuffleMemoryManager, 1, PAGE_SIZE_BYTES); + assertEquals(1, map.getNumDataPages()); + map.free(); + } + } diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 52fa8bcd57e7..445a37b83e98 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -18,8 +18,10 @@ package org.apache.spark.util.collection.unsafe.sort; import java.io.File; +import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.Arrays; import java.util.LinkedList; import java.util.UUID; @@ -34,6 +36,7 @@ import org.mockito.MockitoAnnotations; import org.mockito.invocation.InvocationOnMock; import org.mockito.stubbing.Answer; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.junit.Assert.*; import static org.mockito.AdditionalAnswers.returnsSecondArg; import static org.mockito.Answers.RETURNS_SMART_NULLS; @@ -46,7 +49,7 @@ import org.apache.spark.serializer.SerializerInstance; import org.apache.spark.shuffle.ShuffleMemoryManager; import org.apache.spark.storage.*; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.ExecutorMemoryManager; import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -77,12 +80,13 @@ public int compare( } }; + SparkConf sparkConf; + File tempDir; ShuffleMemoryManager shuffleMemoryManager; @Mock(answer = RETURNS_SMART_NULLS) BlockManager blockManager; @Mock(answer = RETURNS_SMART_NULLS) DiskBlockManager diskBlockManager; @Mock(answer = RETURNS_SMART_NULLS) TaskContext taskContext; - File tempDir; private final long pageSizeBytes = new SparkConf().getSizeAsBytes("spark.buffer.pageSize", "64m"); @@ -96,8 +100,9 @@ public OutputStream apply(OutputStream stream) { @Before public void setUp() { MockitoAnnotations.initMocks(this); - tempDir = new File(Utils.createTempDir$default$1()); - shuffleMemoryManager = new ShuffleMemoryManager(Long.MAX_VALUE); + sparkConf = new SparkConf(); + tempDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "unsafe-test"); + shuffleMemoryManager = ShuffleMemoryManager.create(Long.MAX_VALUE, pageSizeBytes); spillFilesCreated.clear(); taskContext = 
mock(TaskContext.class); when(taskContext.taskMetrics()).thenReturn(new TaskMetrics()); @@ -138,13 +143,18 @@ public DiskBlockObjectWriter answer(InvocationOnMock invocationOnMock) throws Th @After public void tearDown() { - long leakedUnsafeMemory = taskMemoryManager.cleanUpAllAllocatedMemory(); - if (shuffleMemoryManager != null) { - long leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask(); - shuffleMemoryManager = null; - assertEquals(0L, leakedShuffleMemory); + try { + long leakedUnsafeMemory = taskMemoryManager.cleanUpAllAllocatedMemory(); + if (shuffleMemoryManager != null) { + long leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask(); + shuffleMemoryManager = null; + assertEquals(0L, leakedShuffleMemory); + } + assertEquals(0, leakedUnsafeMemory); + } finally { + Utils.deleteRecursively(tempDir); + tempDir = null; } - assertEquals(0, leakedUnsafeMemory); } private void assertSpillFilesWereCleanedUp() { @@ -155,14 +165,19 @@ private void assertSpillFilesWereCleanedUp() { } private static void insertNumber(UnsafeExternalSorter sorter, int value) throws Exception { - final int[] arr = new int[] { value }; - sorter.insertRecord(arr, PlatformDependent.INT_ARRAY_OFFSET, 4, value); + final int[] arr = new int[]{ value }; + sorter.insertRecord(arr, Platform.INT_ARRAY_OFFSET, 4, value); } - @Test - public void testSortingOnlyByPrefix() throws Exception { + private static void insertRecord( + UnsafeExternalSorter sorter, + int[] record, + long prefix) throws IOException { + sorter.insertRecord(record, Platform.INT_ARRAY_OFFSET, record.length * 4, prefix); + } - final UnsafeExternalSorter sorter = UnsafeExternalSorter.create( + private UnsafeExternalSorter newSorter() throws IOException { + return UnsafeExternalSorter.create( taskMemoryManager, shuffleMemoryManager, blockManager, @@ -171,7 +186,11 @@ public void testSortingOnlyByPrefix() throws Exception { prefixComparator, /* initialSize */ 1024, pageSizeBytes); + } + @Test + public void testSortingOnlyByPrefix() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); insertNumber(sorter, 5); insertNumber(sorter, 1); insertNumber(sorter, 3); @@ -186,26 +205,16 @@ public void testSortingOnlyByPrefix() throws Exception { iter.loadNext(); assertEquals(i, iter.getKeyPrefix()); assertEquals(4, iter.getRecordLength()); - // TODO: read rest of value. 
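testSortingOnlyByPrefix above feeds the sorter through insertRecord(base, offset, lengthInBytes, prefix) and then walks the sorted iterator checking the 8-byte key prefix and, with this change, the record payload as well (replacing the TODO just above). The prefix exists so that most comparisons never have to dereference the record; the record comparator is only meant to break ties between equal prefixes. A toy model of that two-level comparison in plain Scala; none of these names are Spark APIs:

    final case class Record(prefix: Long, payload: Array[Byte])

    object PrefixSortSketch {
      // Compare by the 8-byte prefix; fall back to payload bytes only on ties.
      private def compare(a: Record, b: Record): Int = {
        val byPrefix = java.lang.Long.compare(a.prefix, b.prefix)
        if (byPrefix != 0) byPrefix
        else a.payload.zip(b.payload).map { case (x, y) => java.lang.Byte.compare(x, y) }
          .find(_ != 0)
          .getOrElse(java.lang.Integer.compare(a.payload.length, b.payload.length))
      }

      def main(args: Array[String]): Unit = {
        val records = Seq(
          Record(5L, Array[Byte](1)),
          Record(1L, Array[Byte](9)),
          Record(1L, Array[Byte](2)))
        val sorted = records.sortWith((a, b) => compare(a, b) < 0)
        println(sorted.map(_.prefix))   // List(1, 1, 5): the two prefix-1 records are ordered by payload
      }
    }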
+ assertEquals(i, Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); } - sorter.freeMemory(); + sorter.cleanupResources(); assertSpillFilesWereCleanedUp(); } @Test public void testSortingEmptyArrays() throws Exception { - - final UnsafeExternalSorter sorter = UnsafeExternalSorter.create( - taskMemoryManager, - shuffleMemoryManager, - blockManager, - taskContext, - recordComparator, - prefixComparator, - /* initialSize */ 1024, - pageSizeBytes); - + final UnsafeExternalSorter sorter = newSorter(); sorter.insertRecord(null, 0, 0, 0); sorter.insertRecord(null, 0, 0, 0); sorter.spill(); @@ -222,13 +231,97 @@ public void testSortingEmptyArrays() throws Exception { assertEquals(0, iter.getRecordLength()); } - sorter.freeMemory(); + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + + @Test + public void spillingOccursInResponseToMemoryPressure() throws Exception { + shuffleMemoryManager = ShuffleMemoryManager.create(pageSizeBytes * 2, pageSizeBytes); + final UnsafeExternalSorter sorter = newSorter(); + final int numRecords = (int) pageSizeBytes / 4; + for (int i = 0; i <= numRecords; i++) { + insertNumber(sorter, numRecords - i); + } + // Ensure that spill files were created + assertThat(tempDir.listFiles().length, greaterThanOrEqualTo(1)); + // Read back the sorted data: + UnsafeSorterIterator iter = sorter.getSortedIterator(); + + int i = 0; + while (iter.hasNext()) { + iter.loadNext(); + assertEquals(i, iter.getKeyPrefix()); + assertEquals(4, iter.getRecordLength()); + assertEquals(i, Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); + i++; + } + sorter.cleanupResources(); assertSpillFilesWereCleanedUp(); } @Test public void testFillingPage() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + byte[] record = new byte[16]; + while (sorter.getNumberOfAllocatedPages() < 2) { + sorter.insertRecord(record, Platform.BYTE_ARRAY_OFFSET, record.length, 0); + } + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + @Test + public void sortingRecordsThatExceedPageSize() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + final int[] largeRecord = new int[(int) pageSizeBytes + 16]; + Arrays.fill(largeRecord, 456); + final int[] smallRecord = new int[100]; + Arrays.fill(smallRecord, 123); + + insertRecord(sorter, largeRecord, 456); + sorter.spill(); + insertRecord(sorter, smallRecord, 123); + sorter.spill(); + insertRecord(sorter, smallRecord, 123); + insertRecord(sorter, largeRecord, 456); + + UnsafeSorterIterator iter = sorter.getSortedIterator(); + // Small record + assertTrue(iter.hasNext()); + iter.loadNext(); + assertEquals(123, iter.getKeyPrefix()); + assertEquals(smallRecord.length * 4, iter.getRecordLength()); + assertEquals(123, Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); + // Small record + assertTrue(iter.hasNext()); + iter.loadNext(); + assertEquals(123, iter.getKeyPrefix()); + assertEquals(smallRecord.length * 4, iter.getRecordLength()); + assertEquals(123, Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); + // Large record + assertTrue(iter.hasNext()); + iter.loadNext(); + assertEquals(456, iter.getKeyPrefix()); + assertEquals(largeRecord.length * 4, iter.getRecordLength()); + assertEquals(456, Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); + // Large record + assertTrue(iter.hasNext()); + iter.loadNext(); + assertEquals(456, iter.getKeyPrefix()); + assertEquals(largeRecord.length * 4, iter.getRecordLength()); + assertEquals(456, 
Platform.getInt(iter.getBaseObject(), iter.getBaseOffset())); + + assertFalse(iter.hasNext()); + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + + @Test + public void testPeakMemoryUsed() throws Exception { + final long recordLengthBytes = 8; + final long pageSizeBytes = 256; + final long numRecordsPerPage = pageSizeBytes / recordLengthBytes; final UnsafeExternalSorter sorter = UnsafeExternalSorter.create( taskMemoryManager, shuffleMemoryManager, @@ -236,15 +329,57 @@ public void testFillingPage() throws Exception { taskContext, recordComparator, prefixComparator, - /* initialSize */ 1024, + 1024, pageSizeBytes); - byte[] record = new byte[16]; - while (sorter.getNumberOfAllocatedPages() < 2) { - sorter.insertRecord(record, PlatformDependent.BYTE_ARRAY_OFFSET, record.length, 0); + // Peak memory should be monotonically increasing. More specifically, every time + // we allocate a new page it should increase by exactly the size of the page. + long previousPeakMemory = sorter.getPeakMemoryUsedBytes(); + long newPeakMemory; + try { + for (int i = 0; i < numRecordsPerPage * 10; i++) { + insertNumber(sorter, i); + newPeakMemory = sorter.getPeakMemoryUsedBytes(); + // The first page is pre-allocated on instantiation + if (i % numRecordsPerPage == 0 && i > 0) { + // We allocated a new page for this record, so peak memory should change + assertEquals(previousPeakMemory + pageSizeBytes, newPeakMemory); + } else { + assertEquals(previousPeakMemory, newPeakMemory); + } + previousPeakMemory = newPeakMemory; + } + + // Spilling should not change peak memory + sorter.spill(); + newPeakMemory = sorter.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + for (int i = 0; i < numRecordsPerPage; i++) { + insertNumber(sorter, i); + } + newPeakMemory = sorter.getPeakMemoryUsedBytes(); + assertEquals(previousPeakMemory, newPeakMemory); + } finally { + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + } + + @Test + public void testReservePageOnInstantiation() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + try { + assertEquals(1, sorter.getNumberOfAllocatedPages()); + // Inserting a new record doesn't allocate more memory since we already have a page + long peakMemory = sorter.getPeakMemoryUsedBytes(); + insertNumber(sorter, 100); + assertEquals(peakMemory, sorter.getPeakMemoryUsedBytes()); + assertEquals(1, sorter.getNumberOfAllocatedPages()); + } finally { + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); } - sorter.freeMemory(); - assertSpillFilesWereCleanedUp(); } } + diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index 909500930539..778e813df6b5 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -26,7 +26,7 @@ import static org.mockito.Mockito.mock; import org.apache.spark.HashPartitioner; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.ExecutorMemoryManager; import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.MemoryBlock; @@ -36,11 +36,7 @@ public class UnsafeInMemorySorterSuite { private static String getStringFromDataPage(Object baseObject, long baseOffset, int length) { 
final byte[] strBytes = new byte[length]; - PlatformDependent.copyMemory( - baseObject, - baseOffset, - strBytes, - PlatformDependent.BYTE_ARRAY_OFFSET, length); + Platform.copyMemory(baseObject, baseOffset, strBytes, Platform.BYTE_ARRAY_OFFSET, length); return new String(strBytes); } @@ -76,14 +72,10 @@ public void testSortingOnlyByIntegerPrefix() throws Exception { long position = dataPage.getBaseOffset(); for (String str : dataToSort) { final byte[] strBytes = str.getBytes("utf-8"); - PlatformDependent.UNSAFE.putInt(baseObject, position, strBytes.length); + Platform.putInt(baseObject, position, strBytes.length); position += 4; - PlatformDependent.copyMemory( - strBytes, - PlatformDependent.BYTE_ARRAY_OFFSET, - baseObject, - position, - strBytes.length); + Platform.copyMemory( + strBytes, Platform.BYTE_ARRAY_OFFSET, baseObject, position, strBytes.length); position += strBytes.length; } // Since the key fits within the 8-byte prefix, we don't need to do any record comparison, so @@ -113,7 +105,7 @@ public int compare(long prefix1, long prefix2) { position = dataPage.getBaseOffset(); for (int i = 0; i < dataToSort.length; i++) { // position now points to the start of a record (which holds its length). - final int recordLength = PlatformDependent.UNSAFE.getInt(baseObject, position); + final int recordLength = Platform.getInt(baseObject, position); final long address = memoryManager.encodePageNumberAndOffset(dataPage, position); final String str = getStringFromDataPage(baseObject, position + 4, recordLength); final int partitionId = hashPartitioner.getPartition(str); diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index e942d6579b2f..5b84acf40be4 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -18,13 +18,17 @@ package org.apache.spark import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.ref.WeakReference import org.scalatest.Matchers +import org.scalatest.exceptions.TestFailedException +import org.apache.spark.scheduler._ -class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContext { +class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContext { + import InternalAccumulator._ implicit def setAccum[A]: AccumulableParam[mutable.Set[A], A] = new AccumulableParam[mutable.Set[A], A] { @@ -155,4 +159,223 @@ class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContex assert(!Accumulators.originals.get(accId).isDefined) } + test("internal accumulators in TaskContext") { + sc = new SparkContext("local", "test") + val accums = InternalAccumulator.create(sc) + val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null, accums) + val internalMetricsToAccums = taskContext.internalMetricsToAccumulators + val collectedInternalAccums = taskContext.collectInternalAccumulators() + val collectedAccums = taskContext.collectAccumulators() + assert(internalMetricsToAccums.size > 0) + assert(internalMetricsToAccums.values.forall(_.isInternal)) + assert(internalMetricsToAccums.contains(TEST_ACCUMULATOR)) + val testAccum = internalMetricsToAccums(TEST_ACCUMULATOR) + assert(collectedInternalAccums.size === internalMetricsToAccums.size) + assert(collectedInternalAccums.size === collectedAccums.size) + assert(collectedInternalAccums.contains(testAccum.id)) + assert(collectedAccums.contains(testAccum.id)) + } + + test("internal accumulators 
in a stage") { + val listener = new SaveInfoListener + val numPartitions = 10 + sc = new SparkContext("local", "test") + sc.addSparkListener(listener) + // Have each task add 1 to the internal accumulator + val rdd = sc.parallelize(1 to 100, numPartitions).mapPartitions { iter => + TaskContext.get().internalMetricsToAccumulators(TEST_ACCUMULATOR) += 1 + iter + } + // Register asserts in job completion callback to avoid flakiness + listener.registerJobCompletionCallback { _ => + val stageInfos = listener.getCompletedStageInfos + val taskInfos = listener.getCompletedTaskInfos + assert(stageInfos.size === 1) + assert(taskInfos.size === numPartitions) + // The accumulator values should be merged in the stage + val stageAccum = findAccumulableInfo(stageInfos.head.accumulables.values, TEST_ACCUMULATOR) + assert(stageAccum.value.toLong === numPartitions) + // The accumulator should be updated locally on each task + val taskAccumValues = taskInfos.map { taskInfo => + val taskAccum = findAccumulableInfo(taskInfo.accumulables, TEST_ACCUMULATOR) + assert(taskAccum.update.isDefined) + assert(taskAccum.update.get.toLong === 1) + taskAccum.value.toLong + } + // Each task should keep track of the partial value on the way, i.e. 1, 2, ... numPartitions + assert(taskAccumValues.sorted === (1L to numPartitions).toSeq) + } + rdd.count() + } + + test("internal accumulators in multiple stages") { + val listener = new SaveInfoListener + val numPartitions = 10 + sc = new SparkContext("local", "test") + sc.addSparkListener(listener) + // Each stage creates its own set of internal accumulators so the + // values for the same metric should not be mixed up across stages + val rdd = sc.parallelize(1 to 100, numPartitions) + .map { i => (i, i) } + .mapPartitions { iter => + TaskContext.get().internalMetricsToAccumulators(TEST_ACCUMULATOR) += 1 + iter + } + .reduceByKey { case (x, y) => x + y } + .mapPartitions { iter => + TaskContext.get().internalMetricsToAccumulators(TEST_ACCUMULATOR) += 10 + iter + } + .repartition(numPartitions * 2) + .mapPartitions { iter => + TaskContext.get().internalMetricsToAccumulators(TEST_ACCUMULATOR) += 100 + iter + } + // Register asserts in job completion callback to avoid flakiness + listener.registerJobCompletionCallback { _ => + // We ran 3 stages, and the accumulator values should be distinct + val stageInfos = listener.getCompletedStageInfos + assert(stageInfos.size === 3) + val (firstStageAccum, secondStageAccum, thirdStageAccum) = + (findAccumulableInfo(stageInfos(0).accumulables.values, TEST_ACCUMULATOR), + findAccumulableInfo(stageInfos(1).accumulables.values, TEST_ACCUMULATOR), + findAccumulableInfo(stageInfos(2).accumulables.values, TEST_ACCUMULATOR)) + assert(firstStageAccum.value.toLong === numPartitions) + assert(secondStageAccum.value.toLong === numPartitions * 10) + assert(thirdStageAccum.value.toLong === numPartitions * 2 * 100) + } + rdd.count() + } + + test("internal accumulators in fully resubmitted stages") { + testInternalAccumulatorsWithFailedTasks((i: Int) => true) // fail all tasks + } + + test("internal accumulators in partially resubmitted stages") { + testInternalAccumulatorsWithFailedTasks((i: Int) => i % 2 == 0) // fail a subset + } + + /** + * Return the accumulable info that matches the specified name. 
+ */ + private def findAccumulableInfo( + accums: Iterable[AccumulableInfo], + name: String): AccumulableInfo = { + accums.find { a => a.name == name }.getOrElse { + throw new TestFailedException(s"internal accumulator '$name' not found", 0) + } + } + + /** + * Test whether internal accumulators are merged properly if some tasks fail. + */ + private def testInternalAccumulatorsWithFailedTasks(failCondition: (Int => Boolean)): Unit = { + val listener = new SaveInfoListener + val numPartitions = 10 + val numFailedPartitions = (0 until numPartitions).count(failCondition) + // This says use 1 core and retry tasks up to 2 times + sc = new SparkContext("local[1, 2]", "test") + sc.addSparkListener(listener) + val rdd = sc.parallelize(1 to 100, numPartitions).mapPartitionsWithIndex { case (i, iter) => + val taskContext = TaskContext.get() + taskContext.internalMetricsToAccumulators(TEST_ACCUMULATOR) += 1 + // Fail the first attempts of a subset of the tasks + if (failCondition(i) && taskContext.attemptNumber() == 0) { + throw new Exception("Failing a task intentionally.") + } + iter + } + // Register asserts in job completion callback to avoid flakiness + listener.registerJobCompletionCallback { _ => + val stageInfos = listener.getCompletedStageInfos + val taskInfos = listener.getCompletedTaskInfos + assert(stageInfos.size === 1) + assert(taskInfos.size === numPartitions + numFailedPartitions) + val stageAccum = findAccumulableInfo(stageInfos.head.accumulables.values, TEST_ACCUMULATOR) + // We should not double count values in the merged accumulator + assert(stageAccum.value.toLong === numPartitions) + val taskAccumValues = taskInfos.flatMap { taskInfo => + if (!taskInfo.failed) { + // If a task succeeded, its update value should always be 1 + val taskAccum = findAccumulableInfo(taskInfo.accumulables, TEST_ACCUMULATOR) + assert(taskAccum.update.isDefined) + assert(taskAccum.update.get.toLong === 1) + Some(taskAccum.value.toLong) + } else { + // If a task failed, we should not get its accumulator values + assert(taskInfo.accumulables.isEmpty) + None + } + } + assert(taskAccumValues.sorted === (1L to numPartitions).toSeq) + } + rdd.count() + } + +} + +private[spark] object AccumulatorSuite { + + /** + * Run one or more Spark jobs and verify that the peak execution memory accumulator + * is updated afterwards. + */ + def verifyPeakExecutionMemorySet( + sc: SparkContext, + testName: String)(testBody: => Unit): Unit = { + val listener = new SaveInfoListener + sc.addSparkListener(listener) + // Register asserts in job completion callback to avoid flakiness + listener.registerJobCompletionCallback { jobId => + if (jobId == 0) { + // The first job is a dummy one to verify that the accumulator does not already exist + val accums = listener.getCompletedStageInfos.flatMap(_.accumulables.values) + assert(!accums.exists(_.name == InternalAccumulator.PEAK_EXECUTION_MEMORY)) + } else { + // In the subsequent jobs, verify that peak execution memory is updated + val accum = listener.getCompletedStageInfos + .flatMap(_.accumulables.values) + .find(_.name == InternalAccumulator.PEAK_EXECUTION_MEMORY) + .getOrElse { + throw new TestFailedException( + s"peak execution memory accumulator not set in '$testName'", 0) + } + assert(accum.value.toLong > 0) + } + } + // Run the jobs + sc.parallelize(1 to 10).count() + testBody + } +} + +/** + * A simple listener that keeps track of the TaskInfos and StageInfos of all completed jobs. 
+ */ +private class SaveInfoListener extends SparkListener { + private val completedStageInfos: ArrayBuffer[StageInfo] = new ArrayBuffer[StageInfo] + private val completedTaskInfos: ArrayBuffer[TaskInfo] = new ArrayBuffer[TaskInfo] + private var jobCompletionCallback: (Int => Unit) = null // parameter is job ID + + def getCompletedStageInfos: Seq[StageInfo] = completedStageInfos.toArray.toSeq + def getCompletedTaskInfos: Seq[TaskInfo] = completedTaskInfos.toArray.toSeq + + /** Register a callback to be called on job end. */ + def registerJobCompletionCallback(callback: (Int => Unit)): Unit = { + jobCompletionCallback = callback + } + + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { + if (jobCompletionCallback != null) { + jobCompletionCallback(jobEnd.jobId) + } + } + + override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { + completedStageInfos += stageCompleted.stageInfo + } + + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + completedTaskInfos += taskEnd.taskInfo + } } diff --git a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala index 618a5fb24710..cb8bd04e496a 100644 --- a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala @@ -21,7 +21,7 @@ import org.mockito.Mockito._ import org.scalatest.BeforeAndAfter import org.scalatest.mock.MockitoSugar -import org.apache.spark.executor.DataReadMethod +import org.apache.spark.executor.{DataReadMethod, TaskMetrics} import org.apache.spark.rdd.RDD import org.apache.spark.storage._ @@ -65,7 +65,7 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before // in blockManager.put is a losing battle. You have been warned. blockManager = sc.env.blockManager cacheManager = sc.env.cacheManager - val context = new TaskContextImpl(0, 0, 0, 0, null, null) + val context = TaskContext.empty() val computeValue = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) val getValue = blockManager.get(RDDBlockId(rdd.id, split.index)) assert(computeValue.toList === List(1, 2, 3, 4)) @@ -77,7 +77,7 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before val result = new BlockResult(Array(5, 6, 7).iterator, DataReadMethod.Memory, 12) when(blockManager.get(RDDBlockId(0, 0))).thenReturn(Some(result)) - val context = new TaskContextImpl(0, 0, 0, 0, null, null) + val context = TaskContext.empty() val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) assert(value.toList === List(5, 6, 7)) } @@ -86,14 +86,14 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before // Local computation should not persist the resulting value, so don't expect a put(). 
when(blockManager.get(RDDBlockId(0, 0))).thenReturn(None) - val context = new TaskContextImpl(0, 0, 0, 0, null, null, true) + val context = new TaskContextImpl(0, 0, 0, 0, null, null, Seq.empty, runningLocally = true) val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) assert(value.toList === List(1, 2, 3, 4)) } test("verify task metrics updated correctly") { cacheManager = sc.env.cacheManager - val context = new TaskContextImpl(0, 0, 0, 0, null, null) + val context = TaskContext.empty() cacheManager.getOrCompute(rdd3, split, context, StorageLevel.MEMORY_ONLY) assert(context.taskMetrics.updatedBlocks.getOrElse(Seq()).size === 2) } diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 34caca892891..116f027a0f98 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -206,8 +206,8 @@ class ExecutorAllocationManagerSuite val task2Info = createTaskInfo(1, 0, "executor-1") sc.listenerBus.postToAll(SparkListenerTaskStart(2, 0, task2Info)) - sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task1Info, null)) - sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task2Info, null)) + sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task1Info, null)) + sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task2Info, null)) assert(adjustRequestedExecutors(manager) === -1) } @@ -787,6 +787,24 @@ class ExecutorAllocationManagerSuite Map("host2" -> 1, "host3" -> 2, "host4" -> 1, "host5" -> 2)) } + test("SPARK-8366: maxNumExecutorsNeeded should properly handle failed tasks") { + sc = createSparkContext() + val manager = sc.executorAllocationManager.get + assert(maxNumExecutorsNeeded(manager) === 0) + + sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 1))) + assert(maxNumExecutorsNeeded(manager) === 1) + + val taskInfo = createTaskInfo(1, 1, "executor-1") + sc.listenerBus.postToAll(SparkListenerTaskStart(0, 0, taskInfo)) + assert(maxNumExecutorsNeeded(manager) === 1) + + // If the task is failed, we expect it to be resubmitted later. + val taskEndReason = ExceptionFailure(null, null, null, null, null, None) + sc.listenerBus.postToAll(SparkListenerTaskEnd(0, 0, null, taskEndReason, taskInfo, null)) + assert(maxNumExecutorsNeeded(manager) === 1) + } + private def createSparkContext( minExecutors: Int = 1, maxExecutors: Int = 5, diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index 69cb4b44cf7e..aa50a49c5023 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark import org.apache.spark.util.NonSerializable -import java.io.NotSerializableException +import java.io.{IOException, NotSerializableException, ObjectInputStream} // Common state shared by FailureSuite-launched tasks. 
We use a global object // for this because any local variables used in the task closures will rightfully @@ -166,5 +166,69 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext { assert(thrownDueToMemoryLeak.getMessage.contains("memory leak")) } + // Run a 3-task map job in which task 1 always fails with a exception message that + // depends on the failure number, and check that we get the last failure. + test("last failure cause is sent back to driver") { + sc = new SparkContext("local[1,2]", "test") + val data = sc.makeRDD(1 to 3, 3).map { x => + FailureSuiteState.synchronized { + FailureSuiteState.tasksRun += 1 + if (x == 3) { + FailureSuiteState.tasksFailed += 1 + throw new UserException("oops", + new IllegalArgumentException("failed=" + FailureSuiteState.tasksFailed)) + } + } + x * x + } + val thrown = intercept[SparkException] { + data.collect() + } + FailureSuiteState.synchronized { + assert(FailureSuiteState.tasksRun === 4) + } + assert(thrown.getClass === classOf[SparkException]) + assert(thrown.getCause.getClass === classOf[UserException]) + assert(thrown.getCause.getMessage === "oops") + assert(thrown.getCause.getCause.getClass === classOf[IllegalArgumentException]) + assert(thrown.getCause.getCause.getMessage === "failed=2") + FailureSuiteState.clear() + } + + test("failure cause stacktrace is sent back to driver if exception is not serializable") { + sc = new SparkContext("local", "test") + val thrown = intercept[SparkException] { + sc.makeRDD(1 to 3).foreach { _ => throw new NonSerializableUserException } + } + assert(thrown.getClass === classOf[SparkException]) + assert(thrown.getCause === null) + assert(thrown.getMessage.contains("NonSerializableUserException")) + FailureSuiteState.clear() + } + + test("failure cause stacktrace is sent back to driver if exception is not deserializable") { + sc = new SparkContext("local", "test") + val thrown = intercept[SparkException] { + sc.makeRDD(1 to 3).foreach { _ => throw new NonDeserializableUserException } + } + assert(thrown.getClass === classOf[SparkException]) + assert(thrown.getCause === null) + assert(thrown.getMessage.contains("NonDeserializableUserException")) + FailureSuiteState.clear() + } + // TODO: Need to add tests with shuffle fetch failures. 
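
The new FailureSuite cases above assert that a task failure whose exception cannot be serialized (or deserialized) still reaches the driver as a SparkException that carries only the stacktrace text, with getCause left null. The helper classes defined just below rely on standard Java serialization rules; as a minimal sketch of why a single non-serializable field is enough to break this, the following standalone snippet uses only the JDK (the names PlainPayload, BadException, and WhySerializationFails are hypothetical and not from the patch).

    // Sketch only: shows the JDK behavior the NonSerializableUserException helper depends on.
    import java.io.{ByteArrayOutputStream, NotSerializableException, ObjectOutputStream}

    object WhySerializationFails {
      class PlainPayload                               // does not implement java.io.Serializable
      class BadException extends RuntimeException {    // RuntimeException itself is Serializable...
        val payload = new PlainPayload                 // ...but this field poisons the object graph
      }

      def main(args: Array[String]): Unit = {
        val out = new ObjectOutputStream(new ByteArrayOutputStream())
        try {
          out.writeObject(new BadException)
          println("unexpectedly serialized")
        } catch {
          case e: NotSerializableException =>
            println("cannot serialize: " + e.getMessage)
        }
      }
    }
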
} + +class UserException(message: String, cause: Throwable) + extends RuntimeException(message, cause) + +class NonSerializableUserException extends RuntimeException { + val nonSerializableInstanceVariable = new NonSerializable +} + +class NonDeserializableUserException extends RuntimeException { + private def readObject(in: ObjectInputStream): Unit = { + throw new IOException("Intentional exception during deserialization.") + } +} diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 5c57940fa5f7..d4f2ea87650a 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -285,4 +285,12 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext { } } + test("No exception when both num-executors and dynamic allocation set.") { + noException should be thrownBy { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local") + .set("spark.dynamicAllocation.enabled", "true").set("spark.executor.instances", "6")) + assert(sc.executorAllocationManager.isEmpty) + assert(sc.getConf.getInt("spark.executor.instances", 0) === 6) + } + } } diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala index 48509f0759a3..a96a4ce201c2 100644 --- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala @@ -119,27 +119,35 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { val nums = sc.parallelize(1 to 2, 2) val sem = new Semaphore(0) ThreadingSuiteState.clear() + var throwable: Option[Throwable] = None for (i <- 0 until 2) { new Thread { override def run() { - val ans = nums.map(number => { - val running = ThreadingSuiteState.runningThreads - running.getAndIncrement() - val time = System.currentTimeMillis() - while (running.get() != 4 && System.currentTimeMillis() < time + 1000) { - Thread.sleep(100) - } - if (running.get() != 4) { - ThreadingSuiteState.failed.set(true) - } - number - }).collect() - assert(ans.toList === List(1, 2)) - sem.release() + try { + val ans = nums.map(number => { + val running = ThreadingSuiteState.runningThreads + running.getAndIncrement() + val time = System.currentTimeMillis() + while (running.get() != 4 && System.currentTimeMillis() < time + 1000) { + Thread.sleep(100) + } + if (running.get() != 4) { + ThreadingSuiteState.failed.set(true) + } + number + }).collect() + assert(ans.toList === List(1, 2)) + } catch { + case t: Throwable => + throwable = Some(t) + } finally { + sem.release() + } } }.start() } sem.acquire(2) + throwable.foreach { t => throw t } if (ThreadingSuiteState.failed.get()) { logError("Waited 1 second without seeing runningThreads = 4 (it was " + ThreadingSuiteState.runningThreads.get() + "); failing test") @@ -150,13 +158,19 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { test("set local properties in different thread") { sc = new SparkContext("local", "test") val sem = new Semaphore(0) - + var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { override def run() { - sc.setLocalProperty("test", i.toString) - assert(sc.getLocalProperty("test") === i.toString) - sem.release() + try { + sc.setLocalProperty("test", i.toString) + assert(sc.getLocalProperty("test") === i.toString) + } catch { + case t: Throwable => + throwable = 
Some(t) + } finally { + sem.release() + } } } } @@ -164,6 +178,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) + throwable.foreach { t => throw t } assert(sc.getLocalProperty("test") === null) } @@ -171,14 +186,20 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { sc = new SparkContext("local", "test") sc.setLocalProperty("test", "parent") val sem = new Semaphore(0) - + var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { override def run() { - assert(sc.getLocalProperty("test") === "parent") - sc.setLocalProperty("test", i.toString) - assert(sc.getLocalProperty("test") === i.toString) - sem.release() + try { + assert(sc.getLocalProperty("test") === "parent") + sc.setLocalProperty("test", i.toString) + assert(sc.getLocalProperty("test") === i.toString) + } catch { + case t: Throwable => + throwable = Some(t) + } finally { + sem.release() + } } } } @@ -186,6 +207,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) + throwable.foreach { t => throw t } assert(sc.getLocalProperty("test") === "parent") assert(sc.getLocalProperty("Foo") === null) } @@ -194,6 +216,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { val jobStarted = new Semaphore(0) val jobEnded = new Semaphore(0) @volatile var jobResult: JobResult = null + var throwable: Option[Throwable] = None sc = new SparkContext("local", "test") sc.setJobGroup("originalJobGroupId", "description") @@ -210,14 +233,19 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { // Create a new thread which will inherit the current thread's properties val thread = new Thread() { override def run(): Unit = { - assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "originalJobGroupId") - // Sleeps for a total of 10 seconds, but allows cancellation to interrupt the task try { - sc.parallelize(1 to 100).foreach { x => - Thread.sleep(100) + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "originalJobGroupId") + // Sleeps for a total of 10 seconds, but allows cancellation to interrupt the task + try { + sc.parallelize(1 to 100).foreach { x => + Thread.sleep(100) + } + } catch { + case s: SparkException => // ignored so that we don't print noise in test logs } } catch { - case s: SparkException => // ignored so that we don't print noise in test logs + case t: Throwable => + throwable = Some(t) } } } @@ -230,6 +258,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { // modification of the properties object should not affect the properties of running jobs sc.cancelJobGroup("originalJobGroupId") jobEnded.tryAcquire(10, TimeUnit.SECONDS) + throwable.foreach { t => throw t } assert(jobResult.isInstanceOf[JobFailed]) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala index 823050b0aabb..d93febcfd23f 100644 --- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala @@ -19,6 +19,10 @@ package org.apache.spark.deploy import java.io.{File, FileInputStream, FileOutputStream} import java.util.jar.{JarEntry, JarOutputStream} +import java.util.jar.Attributes.Name +import java.util.jar.Manifest + +import scala.collection.mutable.ArrayBuffer import 
com.google.common.io.{Files, ByteStreams} @@ -35,7 +39,7 @@ private[deploy] object IvyTestUtils { * Create the path for the jar and pom from the maven coordinate. Extension should be `jar` * or `pom`. */ - private def pathFromCoordinate( + private[deploy] def pathFromCoordinate( artifact: MavenCoordinate, prefix: File, ext: String, @@ -52,7 +56,7 @@ private[deploy] object IvyTestUtils { } /** Returns the artifact naming based on standard ivy or maven format. */ - private def artifactName( + private[deploy] def artifactName( artifact: MavenCoordinate, useIvyLayout: Boolean, ext: String = ".jar"): String = { @@ -73,7 +77,7 @@ private[deploy] object IvyTestUtils { } /** Write the contents to a file to the supplied directory. */ - private def writeFile(dir: File, fileName: String, contents: String): File = { + private[deploy] def writeFile(dir: File, fileName: String, contents: String): File = { val outputFile = new File(dir, fileName) val outputStream = new FileOutputStream(outputFile) outputStream.write(contents.toCharArray.map(_.toByte)) @@ -90,6 +94,42 @@ private[deploy] object IvyTestUtils { writeFile(dir, "mylib.py", contents) } + /** Create an example R package that calls the given Java class. */ + private def createRFiles( + dir: File, + className: String, + packageName: String): Seq[(String, File)] = { + val rFilesDir = new File(dir, "R" + File.separator + "pkg") + Files.createParentDirs(new File(rFilesDir, "R" + File.separator + "mylib.R")) + val contents = + s"""myfunc <- function(x) { + | SparkR:::callJStatic("$packageName.$className", "myFunc", x) + |} + """.stripMargin + val source = writeFile(new File(rFilesDir, "R"), "mylib.R", contents) + val description = + """Package: sparkPackageTest + |Type: Package + |Title: Test for building an R package + |Version: 0.1 + |Date: 2015-07-08 + |Author: Burak Yavuz + |Imports: methods, SparkR + |Depends: R (>= 3.1), methods, SparkR + |Suggests: testthat + |Description: Test for building an R package within a jar + |License: Apache License (== 2.0) + |Collate: 'mylib.R' + """.stripMargin + val descFile = writeFile(rFilesDir, "DESCRIPTION", description) + val namespace = + """import(SparkR) + |export("myfunc") + """.stripMargin + val nameFile = writeFile(rFilesDir, "NAMESPACE", namespace) + Seq(("R/pkg/R/mylib.R", source), ("R/pkg/DESCRIPTION", descFile), ("R/pkg/NAMESPACE", nameFile)) + } + /** Create a simple testable Class. */ private def createJavaClass(dir: File, className: String, packageName: String): File = { val contents = @@ -97,17 +137,14 @@ private[deploy] object IvyTestUtils { | |import java.lang.Integer; | - |class $className implements java.io.Serializable { - | - | public $className() {} - | - | public Integer myFunc(Integer x) { + |public class $className implements java.io.Serializable { + | public static Integer myFunc(Integer x) { | return x + 1; | } |} """.stripMargin val sourceFile = - new JavaSourceFromString(new File(dir, className + ".java").getAbsolutePath, contents) + new JavaSourceFromString(new File(dir, className).getAbsolutePath, contents) createCompiledClass(className, dir, sourceFile, Seq.empty) } @@ -199,14 +236,25 @@ private[deploy] object IvyTestUtils { } /** Create the jar for the given maven coordinate, using the supplied files. 
*/ - private def packJar( + private[deploy] def packJar( dir: File, artifact: MavenCoordinate, files: Seq[(String, File)], - useIvyLayout: Boolean): File = { + useIvyLayout: Boolean, + withR: Boolean, + withManifest: Option[Manifest] = None): File = { val jarFile = new File(dir, artifactName(artifact, useIvyLayout)) val jarFileStream = new FileOutputStream(jarFile) - val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest()) + val manifest = withManifest.getOrElse { + val mani = new Manifest() + if (withR) { + val attr = mani.getMainAttributes + attr.put(Name.MANIFEST_VERSION, "1.0") + attr.put(new Name("Spark-HasRPackage"), "true") + } + mani + } + val jarStream = new JarOutputStream(jarFileStream, manifest) for (file <- files) { val jarEntry = new JarEntry(file._1) @@ -239,7 +287,8 @@ private[deploy] object IvyTestUtils { dependencies: Option[Seq[MavenCoordinate]] = None, tempDir: Option[File] = None, useIvyLayout: Boolean = false, - withPython: Boolean = false): File = { + withPython: Boolean = false, + withR: Boolean = false): File = { // Where the root of the repository exists, and what Ivy will search in val tempPath = tempDir.getOrElse(Files.createTempDir()) // Create directory if it doesn't exist @@ -255,14 +304,16 @@ private[deploy] object IvyTestUtils { val javaClass = createJavaClass(root, className, artifact.groupId) // A tuple of files representation in the jar, and the file val javaFile = (artifact.groupId.replace(".", "/") + "/" + javaClass.getName, javaClass) - val allFiles = - if (withPython) { - val pythonFile = createPythonFile(root) - Seq(javaFile, (pythonFile.getName, pythonFile)) - } else { - Seq(javaFile) - } - val jarFile = packJar(jarPath, artifact, allFiles, useIvyLayout) + val allFiles = ArrayBuffer[(String, File)](javaFile) + if (withPython) { + val pythonFile = createPythonFile(root) + allFiles.append((pythonFile.getName, pythonFile)) + } + if (withR) { + val rFiles = createRFiles(root, className, artifact.groupId) + allFiles.append(rFiles: _*) + } + val jarFile = packJar(jarPath, artifact, allFiles, useIvyLayout, withR) assert(jarFile.exists(), "Problem creating Jar file") val descriptor = createDescriptor(tempPath, artifact, dependencies, useIvyLayout) assert(descriptor.exists(), "Problem creating Pom file") @@ -286,9 +337,10 @@ private[deploy] object IvyTestUtils { dependencies: Option[String], rootDir: Option[File], useIvyLayout: Boolean = false, - withPython: Boolean = false): File = { + withPython: Boolean = false, + withR: Boolean = false): File = { val deps = dependencies.map(SparkSubmitUtils.extractMavenCoordinates) - val mainRepo = createLocalRepository(artifact, deps, rootDir, useIvyLayout, withPython) + val mainRepo = createLocalRepository(artifact, deps, rootDir, useIvyLayout, withPython, withR) deps.foreach { seq => seq.foreach { dep => createLocalRepository(dep, None, Some(mainRepo), useIvyLayout, withPython = false) }} @@ -311,11 +363,12 @@ private[deploy] object IvyTestUtils { rootDir: Option[File], useIvyLayout: Boolean = false, withPython: Boolean = false, + withR: Boolean = false, ivySettings: IvySettings = new IvySettings)(f: String => Unit): Unit = { val deps = dependencies.map(SparkSubmitUtils.extractMavenCoordinates) purgeLocalIvyCache(artifact, deps, ivySettings) val repo = createLocalRepositoryForTests(artifact, dependencies, rootDir, useIvyLayout, - withPython) + withPython, withR) try { f(repo.toURI.toString) } finally { diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala 
b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala new file mode 100644 index 000000000000..47a64081e297 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy + +import java.io.{PrintStream, OutputStream, File} +import java.net.URI +import java.util.jar.Attributes.Name +import java.util.jar.{JarFile, Manifest} +import java.util.zip.{ZipEntry, ZipFile} + +import org.scalatest.BeforeAndAfterEach +import scala.collection.JavaConversions._ +import scala.collection.mutable.ArrayBuffer + +import com.google.common.io.Files +import org.apache.commons.io.FileUtils + +import org.apache.spark.SparkFunSuite +import org.apache.spark.api.r.RUtils +import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate + +class RPackageUtilsSuite extends SparkFunSuite with BeforeAndAfterEach { + + private val main = MavenCoordinate("a", "b", "c") + private val dep1 = MavenCoordinate("a", "dep1", "c") + private val dep2 = MavenCoordinate("a", "dep2", "d") + + private def getJarPath(coord: MavenCoordinate, repo: File): File = { + new File(IvyTestUtils.pathFromCoordinate(coord, repo, "jar", useIvyLayout = false), + IvyTestUtils.artifactName(coord, useIvyLayout = false, ".jar")) + } + + private val lineBuffer = ArrayBuffer[String]() + + private val noOpOutputStream = new OutputStream { + def write(b: Int) = {} + } + + /** Simple PrintStream that reads data into a buffer */ + private class BufferPrintStream extends PrintStream(noOpOutputStream) { + // scalastyle:off println + override def println(line: String) { + // scalastyle:on println + lineBuffer += line + } + } + + def beforeAll() { + System.setProperty("spark.testing", "true") + } + + override def beforeEach(): Unit = { + lineBuffer.clear() + } + + test("pick which jars to unpack using the manifest") { + val deps = Seq(dep1, dep2).mkString(",") + IvyTestUtils.withRepository(main, Some(deps), None, withR = true) { repo => + val jars = Seq(main, dep1, dep2).map(c => new JarFile(getJarPath(c, new File(new URI(repo))))) + assert(RPackageUtils.checkManifestForR(jars(0)), "should have R code") + assert(!RPackageUtils.checkManifestForR(jars(1)), "should not have R code") + assert(!RPackageUtils.checkManifestForR(jars(2)), "should not have R code") + } + } + + test("build an R package from a jar end to end") { + assume(RUtils.isRInstalled, "R isn't installed on this machine.") + val deps = Seq(dep1, dep2).mkString(",") + IvyTestUtils.withRepository(main, Some(deps), None, withR = true) { repo => + val jars = Seq(main, dep1, dep2).map { c => + getJarPath(c, new File(new URI(repo))) + }.mkString(",") + RPackageUtils.checkAndBuildRPackage(jars, new BufferPrintStream, 
verbose = true) + val firstJar = jars.substring(0, jars.indexOf(",")) + val output = lineBuffer.mkString("\n") + assert(output.contains("Building R package")) + assert(output.contains("Extracting")) + assert(output.contains(s"$firstJar contains R source code. Now installing package.")) + assert(output.contains("doesn't contain R source code, skipping...")) + } + } + + test("jars that don't exist are skipped and print warning") { + assume(RUtils.isRInstalled, "R isn't installed on this machine.") + val deps = Seq(dep1, dep2).mkString(",") + IvyTestUtils.withRepository(main, Some(deps), None, withR = true) { repo => + val jars = Seq(main, dep1, dep2).map { c => + getJarPath(c, new File(new URI(repo))) + "dummy" + }.mkString(",") + RPackageUtils.checkAndBuildRPackage(jars, new BufferPrintStream, verbose = true) + val individualJars = jars.split(",") + val output = lineBuffer.mkString("\n") + individualJars.foreach { jarFile => + assert(output.contains(s"$jarFile")) + } + } + } + + test("faulty R package shows documentation") { + assume(RUtils.isRInstalled, "R isn't installed on this machine.") + IvyTestUtils.withRepository(main, None, None) { repo => + val manifest = new Manifest + val attr = manifest.getMainAttributes + attr.put(Name.MANIFEST_VERSION, "1.0") + attr.put(new Name("Spark-HasRPackage"), "true") + val jar = IvyTestUtils.packJar(new File(new URI(repo)), dep1, Nil, + useIvyLayout = false, withR = false, Some(manifest)) + RPackageUtils.checkAndBuildRPackage(jar.getAbsolutePath, new BufferPrintStream, + verbose = true) + val output = lineBuffer.mkString("\n") + assert(output.contains(RPackageUtils.RJarDoc)) + } + } + + test("SparkR zipping works properly") { + val tempDir = Files.createTempDir() + try { + IvyTestUtils.writeFile(tempDir, "test.R", "abc") + val fakeSparkRDir = new File(tempDir, "SparkR") + assert(fakeSparkRDir.mkdirs()) + IvyTestUtils.writeFile(fakeSparkRDir, "abc.R", "abc") + IvyTestUtils.writeFile(fakeSparkRDir, "DESCRIPTION", "abc") + IvyTestUtils.writeFile(tempDir, "package.zip", "abc") // fake zip file :) + val fakePackageDir = new File(tempDir, "packageTest") + assert(fakePackageDir.mkdirs()) + IvyTestUtils.writeFile(fakePackageDir, "def.R", "abc") + IvyTestUtils.writeFile(fakePackageDir, "DESCRIPTION", "abc") + val finalZip = RPackageUtils.zipRLibraries(tempDir, "sparkr.zip") + assert(finalZip.exists()) + val entries = new ZipFile(finalZip).entries().toSeq.map(_.getName) + assert(entries.contains("/test.R")) + assert(entries.contains("/SparkR/abc.R")) + assert(entries.contains("/SparkR/DESCRIPTION")) + assert(!entries.contains("/package.zip")) + assert(entries.contains("/packageTest/def.R")) + assert(entries.contains("/packageTest/DESCRIPTION")) + } finally { + FileUtils.deleteDirectory(tempDir) + } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index aa78bfe30974..1110ca6051a4 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -159,7 +159,6 @@ class SparkSubmitSuite childArgsStr should include ("--executor-cores 5") childArgsStr should include ("--arg arg1 --arg arg2") childArgsStr should include ("--queue thequeue") - childArgsStr should include ("--num-executors 6") childArgsStr should include regex ("--jar .*thejar.jar") childArgsStr should include regex ("--addJars .*one.jar,.*two.jar,.*three.jar") childArgsStr should include regex ("--files 
.*file1.txt,.*file2.txt") @@ -325,6 +324,8 @@ class SparkSubmitSuite "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), "--name", "testApp", "--master", "local", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -338,6 +339,8 @@ class SparkSubmitSuite "--class", JarCreationTest.getClass.getName.stripSuffix("$"), "--name", "testApp", "--master", "local-cluster[2,1,1024]", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", "--jars", jarsString, unusedJar.toString, "SparkSubmitClassA", "SparkSubmitClassB") runSparkSubmit(args) @@ -356,12 +359,37 @@ class SparkSubmitSuite "--packages", Seq(main, dep).mkString(","), "--repositories", repo, "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", unusedJar.toString, "my.great.lib.MyLib", "my.great.dep.MyLib") runSparkSubmit(args) } } + test("correctly builds R packages included in a jar with --packages") { + // TODO(SPARK-9603): Building a package to $SPARK_HOME/R/lib is unavailable on Jenkins. + // It's hard to write the test in SparkR (because we can't create the repository dynamically) + /* + assume(RUtils.isRInstalled, "R isn't installed on this machine.") + val main = MavenCoordinate("my.great.lib", "mylib", "0.1") + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) + val rScriptDir = + Seq(sparkHome, "R", "pkg", "inst", "tests", "packageInAJarTest.R").mkString(File.separator) + assert(new File(rScriptDir).exists) + IvyTestUtils.withRepository(main, None, None, withR = true) { repo => + val args = Seq( + "--name", "testApp", + "--master", "local-cluster[2,1,1024]", + "--packages", main.toString, + "--repositories", repo, + "--verbose", + "--conf", "spark.ui.enabled=false", + rScriptDir) + runSparkSubmit(args) + } + */ + } + test("resolves command line argument paths correctly") { val jars = "/jar1,/jar2" // --jars val files = "hdfs:/file1,file2" // --files @@ -477,6 +505,8 @@ class SparkSubmitSuite "--master", "local", "--conf", "spark.driver.extraClassPath=" + systemJar, "--conf", "spark.driver.userClassPathFirst=true", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", userJar.toString) runSparkSubmit(args) } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index 01ece1a10f46..63c346c1b890 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -95,6 +95,25 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll { assert(md.getDependencies.length === 2) } + test("excludes works correctly") { + val md = SparkSubmitUtils.getModuleDescriptor + val excludes = Seq("a:b", "c:d") + excludes.foreach { e => + md.addExcludeRule(SparkSubmitUtils.createExclusion(e + ":*", new IvySettings, "default")) + } + val rules = md.getAllExcludeRules + assert(rules.length === 2) + val rule1 = rules(0).getId.getModuleId + assert(rule1.getOrganisation === "a") + assert(rule1.getName === "b") + val rule2 = rules(1).getId.getModuleId + assert(rule2.getOrganisation === "c") + assert(rule2.getName === "d") + intercept[IllegalArgumentException] { + SparkSubmitUtils.createExclusion("e:f:g:h", new IvySettings, "default") + } + } + test("ivy path works correctly") { val md = 
SparkSubmitUtils.getModuleDescriptor val artifacts = for (i <- 0 until 3) yield new MDArtifact(md, s"jar-$i", "jar", "jar") @@ -168,4 +187,15 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll { assert(files.indexOf(main.artifactId) >= 0, "Did not return artifact") } } + + test("exclude dependencies end to end") { + val main = new MavenCoordinate("my.great.lib", "mylib", "0.1") + val dep = "my.great.dep:mydep:0.5" + IvyTestUtils.withRepository(main, Some(dep), None) { repo => + val files = SparkSubmitUtils.resolveMavenCoordinates(main.toString, + Some(repo), None, Seq("my.great.dep:mydep"), isTest = true) + assert(files.indexOf(main.artifactId) >= 0, "Did not return artifact") + assert(files.indexOf("my.great.dep") < 0, "Returned excluded artifact") + } + } } diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 08c41a897a86..1f2a0f0d309c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -283,6 +283,26 @@ class StandaloneDynamicAllocationSuite assert(master.apps.head.getExecutorLimit === 1000) } + test("kill the same executor twice (SPARK-9795)") { + sc = new SparkContext(appConf) + val appId = sc.applicationId + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === Int.MaxValue) + // sync executors between the Master and the driver, needed because + // the driver refuses to kill executors it does not know about + syncExecutors(sc) + // kill the same executor twice + val executors = getExecutorIds(sc) + assert(executors.size === 2) + assert(sc.killExecutor(executors.head)) + assert(sc.killExecutor(executors.head)) + assert(master.apps.head.executors.size === 1) + // The limit should not be lowered twice + assert(master.apps.head.getExecutorLimit === 1) + } + // =============================== // | Utility methods for testing | // =============================== diff --git a/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala b/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala index 8c96b0e71dfd..4b86da536768 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala @@ -99,7 +99,7 @@ object CustomPersistenceEngine { @volatile var lastInstance: Option[CustomPersistenceEngine] = None } -class CustomLeaderElectionAgent(val masterActor: LeaderElectable) extends LeaderElectionAgent { - masterActor.electedLeader() +class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { + masterInstance.electedLeader() } diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index 30780a0da7f8..242bf4b5566e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -40,6 +40,7 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva conf.set("spark.deploy.recoveryMode", "CUSTOM") conf.set("spark.deploy.recoveryMode.factory", 
classOf[CustomRecoveryModeFactory].getCanonicalName) + conf.set("spark.master.rest.enabled", "false") val instantiationAttempts = CustomRecoveryModeFactory.instantiationAttempts @@ -93,8 +94,8 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva publicAddress = "" ) - val (rpcEnv, uiPort, restPort) = - Master.startRpcEnvAndEndpoint("127.0.0.1", 7077, 8080, conf) + val (rpcEnv, _, _) = + Master.startRpcEnvAndEndpoint("127.0.0.1", 0, 0, conf) try { rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, rpcEnv.address, Master.ENDPOINT_NAME) @@ -151,6 +152,14 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva basicScheduling(spreadOut = false) } + test("basic scheduling with more memory - spread out") { + basicSchedulingWithMoreMemory(spreadOut = true) + } + + test("basic scheduling with more memory - no spread out") { + basicSchedulingWithMoreMemory(spreadOut = false) + } + test("scheduling with max cores - spread out") { schedulingWithMaxCores(spreadOut = true) } @@ -214,6 +223,13 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva assert(scheduledCores === Array(10, 10, 10)) } + private def basicSchedulingWithMoreMemory(spreadOut: Boolean): Unit = { + val master = makeMaster() + val appInfo = makeAppInfo(3072) + val scheduledCores = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) + assert(scheduledCores === Array(10, 10, 10)) + } + private def schedulingWithMaxCores(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo1 = makeAppInfo(1024, maxCores = Some(8)) @@ -343,8 +359,8 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva private def makeMaster(conf: SparkConf = new SparkConf): Master = { val securityMgr = new SecurityManager(conf) - val rpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 7077, conf, securityMgr) - val master = new Master(rpcEnv, rpcEnv.address, 8080, securityMgr, conf) + val rpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 0, conf, securityMgr) + val master = new Master(rpcEnv, rpcEnv.address, 0, securityMgr, conf) master } diff --git a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala index 11e87bd1dd8e..34775577de8a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala @@ -73,11 +73,11 @@ class PersistenceEngineSuite extends SparkFunSuite { assert(persistenceEngine.read[String]("test_").isEmpty) // Test deserializing objects that contain RpcEndpointRef - val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf)) + val testRpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf)) try { // Create a real endpoint so that we can test RpcEndpointRef deserialization - val workerEndpoint = rpcEnv.setupEndpoint("worker", new RpcEndpoint { - override val rpcEnv: RpcEnv = rpcEnv + val workerEndpoint = testRpcEnv.setupEndpoint("worker", new RpcEndpoint { + override val rpcEnv: RpcEnv = testRpcEnv }) val workerToPersist = new WorkerInfo( @@ -93,7 +93,8 @@ class PersistenceEngineSuite extends SparkFunSuite { persistenceEngine.addWorker(workerToPersist) - val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData(rpcEnv) + val (storedApps, storedDrivers, storedWorkers) = + 
persistenceEngine.readPersistedData(testRpcEnv) assert(storedApps.isEmpty) assert(storedDrivers.isEmpty) @@ -110,8 +111,8 @@ class PersistenceEngineSuite extends SparkFunSuite { assert(workerToPersist.webUiPort === recoveryWorkerInfo.webUiPort) assert(workerToPersist.publicAddress === recoveryWorkerInfo.publicAddress) } finally { - rpcEnv.shutdown() - rpcEnv.awaitTermination() + testRpcEnv.shutdown() + testRpcEnv.awaitTermination() } } diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala index cd24d7942331..e9034e39a715 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala @@ -38,12 +38,11 @@ class WorkerWatcherSuite extends SparkFunSuite { val conf = new SparkConf() val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf)) val targetWorkerUrl = rpcEnv.uriOf("test", RpcAddress("1.2.3.4", 1234), "Worker") - val otherAddress = "akka://test@4.3.2.1:1234/user/OtherActor" - val otherAkkaAddress = RpcAddress("4.3.2.1", 1234) + val otherRpcAddress = RpcAddress("4.3.2.1", 1234) val workerWatcher = new WorkerWatcher(rpcEnv, targetWorkerUrl) workerWatcher.setTesting(testing = true) rpcEnv.setupEndpoint("worker-watcher", workerWatcher) - workerWatcher.onDisconnected(otherAkkaAddress) + workerWatcher.onDisconnected(otherRpcAddress) assert(!workerWatcher.isShutDown) rpcEnv.shutdown() } diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index d3218a548efc..44eb5a046912 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -286,6 +286,10 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext private def runAndReturnMetrics(job: => Unit, collector: (SparkListenerTaskEnd) => Option[Long]): Long = { val taskMetrics = new ArrayBuffer[Long]() + + // Avoid receiving earlier taskEnd events + sc.listenerBus.waitUntilEmpty(500) + sc.addSparkListener(new SparkListener() { override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { collector(taskEnd).foreach(taskMetrics += _) diff --git a/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala new file mode 100644 index 000000000000..e281e817e493 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.rdd + +import scala.collection.mutable + +import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite, TaskContext} + +class MapPartitionsWithPreparationRDDSuite extends SparkFunSuite with LocalSparkContext { + + test("prepare called before parent partition is computed") { + sc = new SparkContext("local", "test") + + // Have the parent partition push a number to the list + val parent = sc.parallelize(1 to 100, 1).mapPartitions { iter => + TestObject.things.append(20) + iter + } + + // Push a different number during the prepare phase + val preparePartition = () => { TestObject.things.append(10) } + + // Push yet another number during the execution phase + val executePartition = ( + taskContext: TaskContext, + partitionIndex: Int, + notUsed: Unit, + parentIterator: Iterator[Int]) => { + TestObject.things.append(30) + TestObject.things.iterator + } + + // Verify that the numbers are pushed in the order expected + val rdd = new MapPartitionsWithPreparationRDD[Int, Int, Unit]( + parent, preparePartition, executePartition) + val result = rdd.collect() + assert(result === Array(10, 20, 30)) + + TestObject.things.clear() + // Zip two of these RDDs, both should be prepared before the parent is executed + val rdd2 = new MapPartitionsWithPreparationRDD[Int, Int, Unit]( + parent, preparePartition, executePartition) + val result2 = rdd.zipPartitions(rdd2)((a, b) => a).collect() + assert(result2 === Array(10, 10, 20, 30, 20, 30)) + } + +} + +private object TestObject { + val things = new mutable.ListBuffer[Int] +} diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index 3e8816a4c65b..5f73ec867596 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -175,7 +175,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { } val hadoopPart1 = generateFakeHadoopPartition() val pipedRdd = new PipedRDD(nums, "printenv " + varName) - val tContext = new TaskContextImpl(0, 0, 0, 0, null, null) + val tContext = TaskContext.empty() val rddIter = pipedRdd.compute(hadoopPart1, tContext) val arr = rddIter.toArray assert(arr(0) == "/some/path") diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 86dff8fb577d..ed481b1374c4 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -133,11 +133,11 @@ class DAGSchedulerSuite val cacheLocations = new HashMap[(Int, Int), Seq[BlockManagerId]] // stub out BlockManagerMaster.getLocations to use our cacheLocations val blockManagerMaster = new BlockManagerMaster(null, conf, true) { - override def getLocations(blockIds: Array[BlockId]): Seq[Seq[BlockManagerId]] = { + override def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { blockIds.map { _.asRDDId.map(id => (id.rddId -> id.splitIndex)).flatMap(key => cacheLocations.get(key)). getOrElse(Seq()) - }.toSeq + }.toIndexedSeq } override def removeExecutor(execId: String) { // don't need to propagate to the driver, which we don't have @@ -242,7 +242,7 @@ class DAGSchedulerSuite /** Sends TaskSetFailed to the scheduler. 
*/ private def failed(taskSet: TaskSet, message: String) { - runEvent(TaskSetFailed(taskSet, message)) + runEvent(TaskSetFailed(taskSet, message, None)) } /** Sends JobCancelled to the DAG scheduler. */ @@ -527,6 +527,7 @@ class DAGSchedulerSuite } // The map stage should have been submitted. + sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 1) complete(taskSets(0), Seq( @@ -909,7 +910,7 @@ class DAGSchedulerSuite assertDataStructuresEmpty() } - test("reduce tasks should be placed locally with map output") { + ignore("reduce tasks should be placed locally with map output") { // Create an shuffleMapRdd with 1 partition val shuffleMapRdd = new MyRDD(sc, 1, Nil) val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) @@ -926,10 +927,10 @@ class DAGSchedulerSuite assertLocations(reduceTaskSet, Seq(Seq("hostA"))) complete(reduceTaskSet, Seq((Success, 42))) assert(results === Map(0 -> 42)) - assertDataStructuresEmpty + assertDataStructuresEmpty() } - test("reduce task locality preferences should only include machines with largest map outputs") { + ignore("reduce task locality preferences should only include machines with largest map outputs") { val numMapTasks = 4 // Create an shuffleMapRdd with more partitions val shuffleMapRdd = new MyRDD(sc, numMapTasks, Nil) @@ -950,7 +951,29 @@ class DAGSchedulerSuite assertLocations(reduceTaskSet, Seq(hosts)) complete(reduceTaskSet, Seq((Success, 42))) assert(results === Map(0 -> 42)) - assertDataStructuresEmpty + assertDataStructuresEmpty() + } + + test("stages with both narrow and shuffle dependencies use narrow ones for locality") { + // Create an RDD that has both a shuffle dependency and a narrow dependency (e.g. for a join) + val rdd1 = new MyRDD(sc, 1, Nil) + val rdd2 = new MyRDD(sc, 1, Nil, locations = Seq(Seq("hostB"))) + val shuffleDep = new ShuffleDependency(rdd1, null) + val narrowDep = new OneToOneDependency(rdd2) + val shuffleId = shuffleDep.shuffleId + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep, narrowDep)) + submit(reduceRdd, Array(0)) + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", 1)))) + assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === + HashSet(makeBlockManagerId("hostA"))) + + // Reducer should run where RDD 2 has preferences, even though it also has a shuffle dep + val reduceTaskSet = taskSets(1) + assertLocations(reduceTaskSet, Seq(Seq("hostB"))) + complete(reduceTaskSet, Seq((Success, 42))) + assert(results === Map(0 -> 42)) + assertDataStructuresEmpty() } test("Spark exceptions should include call site in stack trace") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala index b3ca150195a5..f7e16af9d3a9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala @@ -19,9 +19,11 @@ package org.apache.spark.scheduler import org.apache.spark.TaskContext -class FakeTask(stageId: Int, prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, 0, 0) { +class FakeTask( + stageId: Int, + prefLocs: Seq[TaskLocation] = Nil) + extends Task[Int](stageId, 0, 0, Seq.empty) { override def runTask(context: TaskContext): Int = 0 - override def preferredLocations: Seq[TaskLocation] = prefLocs } diff --git a/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala index 
383855caefa2..f33324792495 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala @@ -25,7 +25,7 @@ import org.apache.spark.TaskContext * A Task implementation that fails to serialize. */ private[spark] class NotSerializableFakeTask(myId: Int, stageId: Int) - extends Task[Array[Byte]](stageId, 0, 0) { + extends Task[Array[Byte]](stageId, 0, 0, Seq.empty) { override def runTask(context: TaskContext): Array[Byte] = Array.empty[Byte] override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]() diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala index 9201d1e1f328..450ab7b9fe92 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala @@ -57,8 +57,9 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark } val closureSerializer = SparkEnv.get.closureSerializer.newInstance() val func = (c: TaskContext, i: Iterator[String]) => i.next() - val task = new ResultTask[String, String](0, 0, - sc.broadcast(closureSerializer.serialize((rdd, func)).array), rdd.partitions(0), Seq(), 0) + val taskBinary = sc.broadcast(closureSerializer.serialize((rdd, func)).array) + val task = new ResultTask[String, String]( + 0, 0, taskBinary, rdd.partitions(0), Seq.empty, 0, Seq.empty) intercept[RuntimeException] { task.run(0, 0, null) } @@ -66,7 +67,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark } test("all TaskCompletionListeners should be called even if some fail") { - val context = new TaskContextImpl(0, 0, 0, 0, null, null) + val context = TaskContext.empty() val listener = mock(classOf[TaskCompletionListener]) context.addTaskCompletionListener(_ => throw new Exception("blah")) context.addTaskCompletionListener(listener) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 3abb99c4b2b5..edbdb485c5ea 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -48,7 +48,10 @@ class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler) override def executorLost(execId: String) {} - override def taskSetFailed(taskSet: TaskSet, reason: String) { + override def taskSetFailed( + taskSet: TaskSet, + reason: String, + exception: Option[Throwable]): Unit = { taskScheduler.taskSetsFailed += taskSet.id } } @@ -136,7 +139,7 @@ class FakeTaskScheduler(sc: SparkContext, liveExecutors: (String, String)* /* ex /** * A Task implementation that results in a large serialized task. 
*/ -class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0, 0) { +class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0, 0, Seq.empty) { val randomBuffer = new Array[Byte](TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024) val random = new Random(0) random.nextBytes(randomBuffer) diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala index 5ed30f64d705..75221fab51b4 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster.mesos import java.nio.ByteBuffer import java.util import java.util.Collections +import java.util.Arrays import scala.collection.JavaConversions._ import scala.collection.mutable @@ -41,6 +42,38 @@ import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSui class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar { + test("Use configured mesosExecutor.cores for ExecutorInfo") { + val mesosExecutorCores = 3 + val conf = new SparkConf + conf.set("spark.mesos.mesosExecutor.cores", mesosExecutorCores.toString) + + val listenerBus = mock[LiveListenerBus] + listenerBus.post( + SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) + + val sc = mock[SparkContext] + when(sc.getSparkHome()).thenReturn(Option("/spark-home")) + + when(sc.conf).thenReturn(conf) + when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String]) + when(sc.executorMemory).thenReturn(100) + when(sc.listenerBus).thenReturn(listenerBus) + val taskScheduler = mock[TaskSchedulerImpl] + when(taskScheduler.CPUS_PER_TASK).thenReturn(2) + + val mesosSchedulerBackend = new MesosSchedulerBackend(taskScheduler, sc, "master") + + val resources = Arrays.asList( + mesosSchedulerBackend.createResource("cpus", 4), + mesosSchedulerBackend.createResource("mem", 1024)) + // uri is null. 
+ val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id") + val executorResources = executorInfo.getResourcesList + val cpus = executorResources.find(_.getName.equals("cpus")).get.getScalar.getValue + + assert(cpus === mesosExecutorCores) + } + test("check spark-class location correctly") { val conf = new SparkConf conf.set("spark.mesos.executor.home" , "/mesos-home") @@ -262,7 +295,6 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}")) .setHostname(s"host${id.toString}").build() - val mesosOffers = new java.util.ArrayList[Offer] mesosOffers.add(offer) diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala index b354914b6ffd..2eb43b731338 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.scheduler.cluster.mesos +import scala.language.reflectiveCalls + import org.apache.mesos.Protos.Value import org.mockito.Mockito._ import org.scalatest._ import org.scalatest.mock.MockitoSugar + import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} class MesosSchedulerUtilsSuite extends SparkFunSuite with Matchers with MockitoSugar { diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala index f495b6a03795..6d45b1a101be 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala @@ -24,7 +24,7 @@ import org.mockito.Mockito._ import org.scalatest.concurrent.Timeouts import org.scalatest.time.SpanSugar._ -import org.apache.spark.{SparkFunSuite, TaskContext} +import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext} class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { @@ -50,7 +50,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { } test("single task requesting memory") { - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) assert(manager.tryToAcquire(100L) === 100L) assert(manager.tryToAcquire(400L) === 400L) @@ -72,7 +72,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { // Two threads request 500 bytes first, wait for each other to get it, and then request // 500 more; we should immediately return 0 as both are now at 1 / N - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) class State { var t1Result1 = -1L @@ -124,7 +124,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { // Two tasks request 250 bytes first, wait for each other to get it, and then request // 500 more; we should only grant 250 bytes to each of them on this second request - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) class State { var t1Result1 = -1L @@ -176,7 +176,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { // for a bit and releases 250 bytes, which should then be granted to t2. 
Further requests // by t2 will return false right away because it now has 1 / 2N of the memory. - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) class State { var t1Requested = false @@ -241,7 +241,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { // t1 grabs 1000 bytes and then waits until t2 is ready to make a request. It sleeps // for a bit and releases all its memory. t2 should now be able to grab all the memory. - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) class State { var t1Requested = false @@ -307,7 +307,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { } test("tasks should not be granted a negative size") { - val manager = new ShuffleMemoryManager(1000L) + val manager = ShuffleMemoryManager.createForTesting(maxMemory = 1000L) manager.tryToAcquire(700L) val latch = new CountDownLatch(1) diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala index db718ecabbdb..05b3afef5b83 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala @@ -138,7 +138,7 @@ class HashShuffleReaderSuite extends SparkFunSuite with LocalSparkContext { shuffleHandle, reduceId, reduceId + 1, - new TaskContextImpl(0, 0, 0, 0, null, null), + TaskContext.empty(), blockManager, mapOutputTracker) diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index cf8bd8ae6962..828153bdbfc4 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -29,7 +29,7 @@ import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer import org.scalatest.PrivateMethodTester -import org.apache.spark.{SparkFunSuite, TaskContextImpl} +import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.network._ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.shuffle.BlockFetchingListener @@ -95,7 +95,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT ) val iterator = new ShuffleBlockFetcherIterator( - new TaskContextImpl(0, 0, 0, 0, null, null), + TaskContext.empty(), transfer, blockManager, blocksByAddress, @@ -165,7 +165,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)) - val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null) + val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( taskContext, transfer, @@ -227,7 +227,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)) - val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null) + val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( taskContext, transfer, diff --git 
a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala new file mode 100644 index 000000000000..86699e7f5695 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ui + +import javax.servlet.http.HttpServletRequest + +import scala.xml.Node + +import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} + +import org.apache.spark._ +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.ui.jobs.{JobProgressListener, StagePage, StagesTab} +import org.apache.spark.ui.scope.RDDOperationGraphListener + +class StagePageSuite extends SparkFunSuite with LocalSparkContext { + + test("peak execution memory only displayed if unsafe is enabled") { + val unsafeConf = "spark.sql.unsafe.enabled" + val conf = new SparkConf(false).set(unsafeConf, "true") + val html = renderStagePage(conf).toString().toLowerCase + val targetString = "peak execution memory" + assert(html.contains(targetString)) + // Disable unsafe and make sure it's not there + val conf2 = new SparkConf(false).set(unsafeConf, "false") + val html2 = renderStagePage(conf2).toString().toLowerCase + assert(!html2.contains(targetString)) + // Avoid setting anything; it should be displayed by default + val conf3 = new SparkConf(false) + val html3 = renderStagePage(conf3).toString().toLowerCase + assert(html3.contains(targetString)) + } + + test("SPARK-10543: peak execution memory should be per-task rather than cumulative") { + val unsafeConf = "spark.sql.unsafe.enabled" + val conf = new SparkConf(false).set(unsafeConf, "true") + val html = renderStagePage(conf).toString().toLowerCase + // verify min/25/50/75/max show task value not cumulative values + assert(html.contains("10.0 b" * 5)) + } + + /** + * Render a stage page started with the given conf and return the HTML. + * This also runs a dummy stage to populate the page with useful content. 
+ */ + private def renderStagePage(conf: SparkConf): Seq[Node] = { + val jobListener = new JobProgressListener(conf) + val graphListener = new RDDOperationGraphListener(conf) + val tab = mock(classOf[StagesTab], RETURNS_SMART_NULLS) + val request = mock(classOf[HttpServletRequest]) + when(tab.conf).thenReturn(conf) + when(tab.progressListener).thenReturn(jobListener) + when(tab.operationGraphListener).thenReturn(graphListener) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + when(request.getParameter("id")).thenReturn("0") + when(request.getParameter("attempt")).thenReturn("0") + val page = new StagePage(tab) + + // Simulate a stage in job progress listener + val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details") + // Simulate two tasks to test PEAK_EXECUTION_MEMORY correctness + (1 to 2).foreach { + taskId => + val taskInfo = new TaskInfo(taskId, taskId, 0, 0, "0", "localhost", TaskLocality.ANY, false) + val peakExecutionMemory = 10 + taskInfo.accumulables += new AccumulableInfo(0, InternalAccumulator.PEAK_EXECUTION_MEMORY, + Some(peakExecutionMemory.toString), (peakExecutionMemory * taskId).toString, true) + jobListener.onStageSubmitted(SparkListenerStageSubmitted(stageInfo)) + jobListener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo)) + taskInfo.markSuccessful() + jobListener.onTaskEnd( + SparkListenerTaskEnd(0, 0, "result", Success, taskInfo, TaskMetrics.empty)) + } + jobListener.onStageCompleted(SparkListenerStageCompleted(stageInfo)) + page.render(request) + } + +} diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala index 56f7b9cf1f35..b140387d309f 100644 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala @@ -240,7 +240,7 @@ class JobProgressListenerSuite extends SparkFunSuite with LocalSparkContext with val taskFailedReasons = Seq( Resubmitted, new FetchFailed(null, 0, 0, 0, "ignored"), - ExceptionFailure("Exception", "description", null, null, None), + ExceptionFailure("Exception", "description", null, null, None, None), TaskResultLost, TaskKilled, ExecutorLostFailure("0"), diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index dde95f377843..343a4139b0ca 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -163,7 +163,8 @@ class JsonProtocolSuite extends SparkFunSuite { } test("ExceptionFailure backward compatibility") { - val exceptionFailure = ExceptionFailure("To be", "or not to be", stackTrace, null, None) + val exceptionFailure = ExceptionFailure("To be", "or not to be", stackTrace, null, + None, None) val oldEvent = JsonProtocol.taskEndReasonToJson(exceptionFailure) .removeField({ _._1 == "Full Stack Trace" }) assertEquals(exceptionFailure, JsonProtocol.taskEndReasonFromJson(oldEvent)) diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8f7e402d5f2a..1fb81ad565b4 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -720,4 +720,18 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { 
assert(Utils.nanSafeCompareFloats(Float.PositiveInfinity, Float.NaN) === -1) assert(Utils.nanSafeCompareFloats(Float.NegativeInfinity, Float.NaN) === -1) } + + test("isDynamicAllocationEnabled") { + val conf = new SparkConf() + assert(Utils.isDynamicAllocationEnabled(conf) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.dynamicAllocation.enabled", "false")) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.dynamicAllocation.enabled", "true")) === true) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.executor.instances", "1")) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.executor.instances", "0")) === true) + } + } diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 9c362f0de707..12e9bafcc92c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -399,4 +399,19 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { sc.stop() } + test("external aggregation updates peak execution memory") { + val conf = createSparkConf(loadDefaults = false) + .set("spark.shuffle.memoryFraction", "0.001") + .set("spark.shuffle.manager", "hash") // make sure we're not also using ExternalSorter + sc = new SparkContext("local", "test", conf) + // No spilling + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external map without spilling") { + sc.parallelize(1 to 10, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + } + // With spilling + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external map with spilling") { + sc.parallelize(1 to 1000 * 1000, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + } + } + } diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index 986cd8623d14..bdb0f4d507a7 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -692,7 +692,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { sortWithoutBreakingSortingContracts(createSparkConf(true, false)) } - def sortWithoutBreakingSortingContracts(conf: SparkConf) { + private def sortWithoutBreakingSortingContracts(conf: SparkConf) { conf.set("spark.shuffle.memoryFraction", "0.01") conf.set("spark.shuffle.manager", "sort") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) @@ -743,5 +743,15 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } sorter2.stop() - } + } + + test("sorting updates peak execution memory") { + val conf = createSparkConf(loadDefaults = false, kryo = false) + .set("spark.shuffle.manager", "sort") + sc = new SparkContext("local", "test", conf) + // Avoid aggregating here to make sure we're not also using ExternalAppendOnlyMap + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external sorter") { + sc.parallelize(1 to 1000, 2).repartition(100).count() + } + } } diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala index 26a2e96edaaa..0326ed70b5ed 100644 --- 
a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala @@ -55,6 +55,44 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks { forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) } } + test("Binary prefix comparator") { + + def compareBinary(x: Array[Byte], y: Array[Byte]): Int = { + for (i <- 0 until x.length; if i < y.length) { + val res = x(i).compare(y(i)) + if (res != 0) return res + } + x.length - y.length + } + + def testPrefixComparison(x: Array[Byte], y: Array[Byte]): Unit = { + val s1Prefix = PrefixComparators.BinaryPrefixComparator.computePrefix(x) + val s2Prefix = PrefixComparators.BinaryPrefixComparator.computePrefix(y) + val prefixComparisonResult = + PrefixComparators.BINARY.compare(s1Prefix, s2Prefix) + assert( + (prefixComparisonResult == 0) || + (prefixComparisonResult < 0 && compareBinary(x, y) < 0) || + (prefixComparisonResult > 0 && compareBinary(x, y) > 0)) + } + + // scalastyle:off + val regressionTests = Table( + ("s1", "s2"), + ("abc", "世界"), + ("你好", "世界"), + ("你好123", "你好122") + ) + // scalastyle:on + + forAll (regressionTests) { (s1: String, s2: String) => + testPrefixComparison(s1.getBytes("UTF-8"), s2.getBytes("UTF-8")) + } + forAll { (s1: String, s2: String) => + testPrefixComparison(s1.getBytes("UTF-8"), s2.getBytes("UTF-8")) + } + } + test("double prefix comparator handles NaNs properly") { val nan1: Double = java.lang.Double.longBitsToDouble(0x7ff0000000000001L) val nan2: Double = java.lang.Double.longBitsToDouble(0x7fffffffffffffffL) diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh deleted file mode 100755 index 4311c8c9e4ca..000000000000 --- a/dev/create-release/create-release.sh +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Quick-and-dirty automation of making maven and binary releases. Not robust at all. -# Publishes releases to Maven and packages/copies binary release artifacts. -# Expects to be run in a totally empty directory. -# -# Options: -# --skip-create-release Assume the desired release tag already exists -# --skip-publish Do not publish to Maven central -# --skip-package Do not package and upload binary artifacts -# Would be nice to add: -# - Send output to stderr and have useful logging in stdout - -# Note: The following variables must be set before use! 
-ASF_USERNAME=${ASF_USERNAME:-pwendell} -ASF_PASSWORD=${ASF_PASSWORD:-XXX} -GPG_PASSPHRASE=${GPG_PASSPHRASE:-XXX} -GIT_BRANCH=${GIT_BRANCH:-branch-1.0} -RELEASE_VERSION=${RELEASE_VERSION:-1.2.0} -# Allows publishing under a different version identifier than -# was present in the actual release sources (e.g. rc-X) -PUBLISH_VERSION=${PUBLISH_VERSION:-$RELEASE_VERSION} -NEXT_VERSION=${NEXT_VERSION:-1.2.1} -RC_NAME=${RC_NAME:-rc2} - -M2_REPO=~/.m2/repository -SPARK_REPO=$M2_REPO/org/apache/spark -NEXUS_ROOT=https://repository.apache.org/service/local/staging -NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads - -if [ -z "$JAVA_HOME" ]; then - echo "Error: JAVA_HOME is not set, cannot proceed." - exit -1 -fi -JAVA_7_HOME=${JAVA_7_HOME:-$JAVA_HOME} - -set -e - -GIT_TAG=v$RELEASE_VERSION-$RC_NAME - -if [[ ! "$@" =~ --skip-create-release ]]; then - echo "Creating release commit and publishing to Apache repository" - # Artifact publishing - git clone https://$ASF_USERNAME:$ASF_PASSWORD@git-wip-us.apache.org/repos/asf/spark.git \ - -b $GIT_BRANCH - pushd spark - export MAVEN_OPTS="-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g" - - # Create release commits and push them to github - # NOTE: This is done "eagerly" i.e. we don't check if we can succesfully build - # or before we coin the release commit. This helps avoid races where - # other people add commits to this branch while we are in the middle of building. - cur_ver="${RELEASE_VERSION}-SNAPSHOT" - rel_ver="${RELEASE_VERSION}" - next_ver="${NEXT_VERSION}-SNAPSHOT" - - old="^\( \{2,4\}\)${cur_ver}<\/version>$" - new="\1${rel_ver}<\/version>" - find . -name pom.xml | grep -v dev | xargs -I {} sed -i \ - -e "s/${old}/${new}/" {} - find . -name package.scala | grep -v dev | xargs -I {} sed -i \ - -e "s/${old}/${new}/" {} - - git commit -a -m "Preparing Spark release $GIT_TAG" - echo "Creating tag $GIT_TAG at the head of $GIT_BRANCH" - git tag $GIT_TAG - - old="^\( \{2,4\}\)${rel_ver}<\/version>$" - new="\1${next_ver}<\/version>" - find . -name pom.xml | grep -v dev | xargs -I {} sed -i \ - -e "s/$old/$new/" {} - find . -name package.scala | grep -v dev | xargs -I {} sed -i \ - -e "s/${old}/${new}/" {} - git commit -a -m "Preparing development version $next_ver" - git push origin $GIT_TAG - git push origin HEAD:$GIT_BRANCH - popd - rm -rf spark -fi - -if [[ ! "$@" =~ --skip-publish ]]; then - git clone https://$ASF_USERNAME:$ASF_PASSWORD@git-wip-us.apache.org/repos/asf/spark.git - pushd spark - git checkout --force $GIT_TAG - - # Substitute in case published version is different than released - old="^\( \{2,4\}\)${RELEASE_VERSION}<\/version>$" - new="\1${PUBLISH_VERSION}<\/version>" - find . 
-name pom.xml | grep -v dev | xargs -I {} sed -i \ - -e "s/${old}/${new}/" {} - - # Using Nexus API documented here: - # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API - echo "Creating Nexus staging repository" - repo_request="Apache Spark $GIT_TAG (published as $PUBLISH_VERSION)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) - staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") - echo "Created Nexus staging repository: $staged_repo_id" - - rm -rf $SPARK_REPO - - build/mvn -DskipTests -Pyarn -Phive \ - -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ - clean install - - ./dev/change-scala-version.sh 2.11 - - build/mvn -DskipTests -Pyarn -Phive \ - -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ - clean install - - ./dev/change-scala-version.sh 2.10 - - pushd $SPARK_REPO - - # Remove any extra files generated during install - find . -type f |grep -v \.jar |grep -v \.pom | xargs rm - - echo "Creating hash and signature files" - for file in $(find . -type f) - do - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --output $file.asc --detach-sig --armour $file; - if [ $(command -v md5) ]; then - # Available on OS X; -q to keep only hash - md5 -q $file > $file.md5 - else - # Available on Linux; cut to keep only hash - md5sum $file | cut -f1 -d' ' > $file.md5 - fi - shasum -a 1 $file | cut -f1 -d' ' > $file.sha1 - done - - nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" - for file in $(find . -type f) - do - # strip leading ./ - file_short=$(echo $file | sed -e "s/\.\///") - dest_url="$nexus_upload/org/apache/spark/$file_short" - echo " Uploading $file_short" - curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url - done - - echo "Closing nexus staging repository" - repo_request="$staged_repo_idApache Spark $GIT_TAG (published as $PUBLISH_VERSION)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) - echo "Closed Nexus staging repository: $staged_repo_id" - - popd - popd - rm -rf spark -fi - -if [[ ! "$@" =~ --skip-package ]]; then - # Source and binary tarballs - echo "Packaging release tarballs" - git clone https://git-wip-us.apache.org/repos/asf/spark.git - cd spark - git checkout --force $GIT_TAG - release_hash=`git rev-parse HEAD` - - rm .gitignore - rm -rf .git - cd .. 
- - cp -r spark spark-$RELEASE_VERSION - tar cvzf spark-$RELEASE_VERSION.tgz spark-$RELEASE_VERSION - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --armour --output spark-$RELEASE_VERSION.tgz.asc \ - --detach-sig spark-$RELEASE_VERSION.tgz - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md MD5 spark-$RELEASE_VERSION.tgz > \ - spark-$RELEASE_VERSION.tgz.md5 - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md SHA512 spark-$RELEASE_VERSION.tgz > \ - spark-$RELEASE_VERSION.tgz.sha - rm -rf spark-$RELEASE_VERSION - - # Updated for each binary build - make_binary_release() { - NAME=$1 - FLAGS=$2 - ZINC_PORT=$3 - cp -r spark spark-$RELEASE_VERSION-bin-$NAME - - cd spark-$RELEASE_VERSION-bin-$NAME - - # TODO There should probably be a flag to make-distribution to allow 2.11 support - if [[ $FLAGS == *scala-2.11* ]]; then - ./dev/change-scala-version.sh 2.11 - fi - - export ZINC_PORT=$ZINC_PORT - echo "Creating distribution: $NAME ($FLAGS)" - ./make-distribution.sh --name $NAME --tgz $FLAGS -DzincPort=$ZINC_PORT 2>&1 > \ - ../binary-release-$NAME.log - cd .. - cp spark-$RELEASE_VERSION-bin-$NAME/spark-$RELEASE_VERSION-bin-$NAME.tgz . - - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --armour \ - --output spark-$RELEASE_VERSION-bin-$NAME.tgz.asc \ - --detach-sig spark-$RELEASE_VERSION-bin-$NAME.tgz - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md \ - MD5 spark-$RELEASE_VERSION-bin-$NAME.tgz > \ - spark-$RELEASE_VERSION-bin-$NAME.tgz.md5 - echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md \ - SHA512 spark-$RELEASE_VERSION-bin-$NAME.tgz > \ - spark-$RELEASE_VERSION-bin-$NAME.tgz.sha - } - - # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds - # share the same Zinc server. - make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & - make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & - make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & - make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & - make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & - make_binary_release "mapr3" "-Pmapr3 -Psparkr -Phive -Phive-thriftserver" "3035" & - make_binary_release "mapr4" "-Pmapr4 -Psparkr -Pyarn -Phive -Phive-thriftserver" "3036" & - make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & - wait - rm -rf spark-$RELEASE_VERSION-bin-*/ - - # Copy data - echo "Copying release tarballs" - rc_folder=spark-$RELEASE_VERSION-$RC_NAME - ssh $ASF_USERNAME@people.apache.org \ - mkdir /home/$ASF_USERNAME/public_html/$rc_folder - scp spark-* \ - $ASF_USERNAME@people.apache.org:/home/$ASF_USERNAME/public_html/$rc_folder/ - - # Docs - cd spark - sbt/sbt clean - cd docs - # Compile docs with Java 7 to use nicer format - JAVA_HOME="$JAVA_7_HOME" PRODUCTION=1 RELEASE_VERSION="$RELEASE_VERSION" jekyll build - echo "Copying release documentation" - rc_docs_folder=${rc_folder}-docs - ssh $ASF_USERNAME@people.apache.org \ - mkdir /home/$ASF_USERNAME/public_html/$rc_docs_folder - rsync -r _site/* $ASF_USERNAME@people.apache.org:/home/$ASF_USERNAME/public_html/$rc_docs_folder - - echo "Release $RELEASE_VERSION completed:" - echo "Git tag:\t $GIT_TAG" - echo "Release commit:\t $release_hash" - echo "Binary location:\t http://people.apache.org/~$ASF_USERNAME/$rc_folder" - echo "Doc location:\t 
http://people.apache.org/~$ASF_USERNAME/$rc_docs_folder" -fi diff --git a/dev/create-release/generate-changelist.py b/dev/create-release/generate-changelist.py index 2e1a35a62934..37e5651b6774 100755 --- a/dev/create-release/generate-changelist.py +++ b/dev/create-release/generate-changelist.py @@ -31,8 +31,8 @@ import traceback SPARK_HOME = os.environ["SPARK_HOME"] -NEW_RELEASE_VERSION = "1.0.0" -PREV_RELEASE_GIT_TAG = "v0.9.1" +NEW_RELEASE_VERSION = "1.5.0" +PREV_RELEASE_GIT_TAG = "v1.4.0" CHANGELIST = "CHANGES.txt" OLD_CHANGELIST = "%s.old" % (CHANGELIST) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh new file mode 100755 index 000000000000..d0b3a54dde1d --- /dev/null +++ b/dev/create-release/release-build.sh @@ -0,0 +1,321 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: release-build.sh +Creates build deliverables from a Spark commit. + +Top level targets are + package: Create binary packages and copy them to people.apache + docs: Build docs and copy them to people.apache + publish-snapshot: Publish snapshot release to Apache snapshots + publish-release: Publish a release to Apache release repo + +All other inputs are environment variables + +GIT_REF - Release tag or commit to build from +SPARK_VERSION - Release identifier used when publishing +SPARK_PACKAGE_VERSION - Release identifier in top level package directory +REMOTE_PARENT_DIR - Parent in which to create doc or release builds. +REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only + have this number of subdirectories (by deleting old ones). WARNING: This deletes data. 
+ +ASF_USERNAME - Username of ASF committer account +ASF_PASSWORD - Password of ASF committer account +ASF_RSA_KEY - RSA private key file for ASF committer account + +GPG_KEY - GPG key used to sign release artifacts +GPG_PASSPHRASE - Passphrase for GPG key +EOF + exit 1 +} + +set -e + +if [ $# -eq 0 ]; then + exit_with_usage +fi + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do + if [ -z "${!env}" ]; then + echo "ERROR: $env must be set to run this script" + exit_with_usage + fi +done + +# Commit ref to checkout when building +GIT_REF=${GIT_REF:-master} + +# Destination directory parent on remote server +REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} + +SSH="ssh -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" +GPG="gpg --no-tty --batch" +NEXUS_ROOT=https://repository.apache.org/service/local/staging +NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads +BASE_DIR=$(pwd) + +MVN="build/mvn --force" +PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" +PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" + +rm -rf spark +git clone https://git-wip-us.apache.org/repos/asf/spark.git +cd spark +git checkout $GIT_REF +git_hash=`git rev-parse --short HEAD` +echo "Checked out Spark git hash $git_hash" + +if [ -z "$SPARK_VERSION" ]; then + SPARK_VERSION=$($MVN help:evaluate -Dexpression=project.version \ + | grep -v INFO | grep -v WARNING | grep -v Download) +fi + +if [ -z "$SPARK_PACKAGE_VERSION" ]; then + SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" +fi + +DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" +USER_HOST="$ASF_USERNAME@people.apache.org" + +rm .gitignore +rm -rf .git +cd .. + +if [ -n "$REMOTE_PARENT_MAX_LENGTH" ]; then + old_dirs=$($SSH $USER_HOST ls -t $REMOTE_PARENT_DIR | tail -n +$REMOTE_PARENT_MAX_LENGTH) + for old_dir in $old_dirs; do + echo "Removing directory: $old_dir" + $SSH $USER_HOST rm -r $REMOTE_PARENT_DIR/$old_dir + done +fi + +if [[ "$1" == "package" ]]; then + # Source and binary tarballs + echo "Packaging release tarballs" + cp -r spark spark-$SPARK_VERSION + tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ + --detach-sig spark-$SPARK_VERSION.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ + spark-$SPARK_VERSION.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha + rm -rf spark-$SPARK_VERSION + + # Updated for each binary build + make_binary_release() { + NAME=$1 + FLAGS=$2 + ZINC_PORT=$3 + cp -r spark spark-$SPARK_VERSION-bin-$NAME + + cd spark-$SPARK_VERSION-bin-$NAME + + # TODO There should probably be a flag to make-distribution to allow 2.11 support + if [[ $FLAGS == *scala-2.11* ]]; then + ./dev/change-scala-version.sh 2.11 + fi + + export ZINC_PORT=$ZINC_PORT + echo "Creating distribution: $NAME ($FLAGS)" + ./make-distribution.sh --name $NAME --tgz $FLAGS -DzincPort=$ZINC_PORT 2>&1 > \ + ../binary-release-$NAME.log + cd .. + cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
+ + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ + --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.sha + } + + # TODO: Check exit codes of children here: + # http://stackoverflow.com/questions/1570262/shell-get-exit-code-of-background-process + + # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds + # share the same Zinc server. + make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & + make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & + make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & + make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & + make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & + make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn" "3038" & + wait + rm -rf spark-$SPARK_VERSION-bin-*/ + + # Copy data + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" + echo "Copying release tarballs to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + rsync -e "$SSH" spark-* $USER_HOST:$dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + exit 0 +fi + +if [[ "$1" == "docs" ]]; then + # Documentation + cd spark + echo "Building Spark docs" + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" + cd docs + # Compile docs with Java 7 to use nicer format + # TODO: Make configurable to add this: PRODUCTION=1 + PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build + echo "Copying release documentation to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + rsync -e "$SSH" -r _site/* $USER_HOST:$dest_dir + cd .. + exit 0 +fi + +if [[ "$1" == "publish-snapshot" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Deploying Spark SNAPSHOT at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + if [[ ! 
$SPARK_VERSION == *"SNAPSHOT"* ]]; then + echo "ERROR: Snapshots must have a version containing SNAPSHOT" + echo "ERROR: You gave version '$SPARK_VERSION'" + exit 1 + fi + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + tmp_settings="tmp-settings.xml" + echo "" > $tmp_settings + echo "apache.snapshots.https$ASF_USERNAME" >> $tmp_settings + echo "$ASF_PASSWORD" >> $tmp_settings + echo "" >> $tmp_settings + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver deploy + ./dev/change-scala-version.sh 2.11 + $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ + -DskipTests $PUBLISH_PROFILES clean deploy + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + rm $tmp_settings + cd .. + exit 0 +fi + +if [[ "$1" == "publish-release" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Publishing Spark checkout at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + + # Using Nexus API documented here: + # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API + echo "Creating Nexus staging repository" + repo_request="Apache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) + staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") + echo "Created Nexus staging repository: $staged_repo_id" + + tmp_repo=$(mktemp -d spark-repo-XXXXX) + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver clean install + + ./dev/change-scala-version.sh 2.11 + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -Dscala-2.11 \ + -DskipTests $PUBLISH_PROFILES clean install + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + ./dev/change-version-to-2.10.sh + + pushd $tmp_repo/org/apache/spark + + # Remove any extra files generated during install + find . -type f |grep -v \.jar |grep -v \.pom | xargs rm + + echo "Creating hash and signature files" + for file in $(find . -type f) + do + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ + --detach-sig --armour $file; + if [ $(command -v md5) ]; then + # Available on OS X; -q to keep only hash + md5 -q $file > $file.md5 + else + # Available on Linux; cut to keep only hash + md5sum $file | cut -f1 -d' ' > $file.md5 + fi + sha1sum $file | cut -f1 -d' ' > $file.sha1 + done + + nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id + echo "Uplading files to $nexus_upload" + for file in $(find . 
-type f) + do + # strip leading ./ + file_short=$(echo $file | sed -e "s/\.\///") + dest_url="$nexus_upload/org/apache/spark/$file_short" + echo " Uploading $file_short" + curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url + done + + echo "Closing nexus staging repository" + repo_request="$staged_repo_idApache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) + echo "Closed Nexus staging repository: $staged_repo_id" + popd + rm -rf $tmp_repo + cd .. + exit 0 +fi + +cd .. +rm -rf spark +echo "ERROR: expects to be called with 'package', 'docs', 'publish-release' or 'publish-snapshot'" diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh new file mode 100755 index 000000000000..b0a3374becc6 --- /dev/null +++ b/dev/create-release/release-tag.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: tag-release.sh +Tags a Spark release on a particular branch. + +Inputs are specified with the following environment variables: +ASF_USERNAME - Apache Username +ASF_PASSWORD - Apache Password +GIT_NAME - Name to use with git +GIT_EMAIL - E-mail address to use with git +GIT_BRANCH - Git branch on which to make release +RELEASE_VERSION - Version used in pom files for release +RELEASE_TAG - Name of release tag +NEXT_VERSION - Development version after release +EOF + exit 1 +} + +set -e + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do + if [ -z "${!env}" ]; then + echo "$env must be set to run this script" + exit 1 + fi +done + +ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" +MVN="build/mvn --force" + +rm -rf spark +git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH +cd spark + +git config user.name "$GIT_NAME" +git config user.email $GIT_EMAIL + +# Create release version +$MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing Spark release $RELEASE_TAG" +echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" +git tag $RELEASE_TAG + +# TODO: It would be nice to do some verifications here +# i.e. check whether ec2 scripts have the new version + +# Create next version +$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing development version $NEXT_VERSION" + +# Push changes +git push origin $RELEASE_TAG +git push origin HEAD:$GIT_BRANCH + +cd .. 
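Taken together, the two new scripts effectively replace the old create-release.sh: release-tag.sh cuts and pushes the release tag, and release-build.sh produces the deliverables from it. A sketch of how they might be driven, based only on the usage blocks above (all usernames, versions, key ids and passphrases below are placeholder values, and the scripts are assumed to be run from the root of a Spark checkout, since they clone their own working copy into ./spark):

    $ export ASF_USERNAME=someuser ASF_PASSWORD=notreal GIT_NAME="Some User" GIT_EMAIL=someuser@apache.org
    $ export GIT_BRANCH=branch-1.5 RELEASE_VERSION=1.5.0 RELEASE_TAG=v1.5.0-rc2 NEXT_VERSION=1.5.1-SNAPSHOT
    $ dev/create-release/release-tag.sh                     # tag the release, then bump to the next dev version
    $ export GIT_REF=v1.5.0-rc2 SPARK_VERSION=1.5.0 ASF_RSA_KEY=$HOME/.ssh/id_rsa
    $ export GPG_KEY=ABCD1234 GPG_PASSPHRASE=notreal
    $ dev/create-release/release-build.sh package           # source/binary tarballs, copied to people.apache
    $ dev/create-release/release-build.sh docs              # build and copy the documentation
    $ dev/create-release/release-build.sh publish-release   # stage artifacts in the Apache Nexus repo

The publish-snapshot target works the same way, except that SPARK_VERSION must contain "SNAPSHOT" or the script aborts.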
+rm -rf spark diff --git a/dev/run-tests.py b/dev/run-tests.py index b6d181418f02..f689425ee40b 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -273,6 +273,7 @@ def get_hadoop_profiles(hadoop_version): "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"], } if hadoop_version in sbt_maven_hadoop_profiles: @@ -289,7 +290,7 @@ def build_spark_maven(hadoop_version): mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print("[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments: ", + print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ", " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -302,17 +303,19 @@ def build_spark_sbt(hadoop_version): "assembly/assembly", "streaming-kafka-assembly/assembly", "streaming-flume-assembly/assembly", + "streaming-mqtt-assembly/assembly", + "streaming-mqtt/test:assembly", "streaming-kinesis-asl-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: ", + print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ", " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) def build_apache_spark(build_tool, hadoop_version): - """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or + """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or `maven`). Defaults to using `sbt`.""" set_title_and_block("Building Spark", "BLOCK_BUILD") diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 956dc81b62e9..346452f3174e 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -134,7 +134,7 @@ def contains_file(self, filename): # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't # fail other PRs. 
streaming_kinesis_asl = Module( - name="kinesis-asl", + name="streaming-kinesis-asl", dependencies=[], source_file_regexes=[ "extras/kinesis-asl/", @@ -147,7 +147,7 @@ def contains_file(self, filename): "ENABLE_KINESIS_TESTS": "1" }, sbt_test_goals=[ - "kinesis-asl/test", + "streaming-kinesis-asl/test", ] ) @@ -181,6 +181,7 @@ def contains_file(self, filename): dependencies=[streaming], source_file_regexes=[ "external/mqtt", + "external/mqtt-assembly", ], sbt_test_goals=[ "streaming-mqtt/test", @@ -306,6 +307,7 @@ def contains_file(self, filename): streaming, streaming_kafka, streaming_flume_assembly, + streaming_mqtt, streaming_kinesis_asl ], source_file_regexes=[ @@ -331,6 +333,7 @@ def contains_file(self, filename): "pyspark.mllib.feature", "pyspark.mllib.fpm", "pyspark.mllib.linalg.__init__", + "pyspark.mllib.linalg.distributed", "pyspark.mllib.random", "pyspark.mllib.recommendation", "pyspark.mllib.regression", diff --git a/docker/spark-mesos/Dockerfile b/docker/spark-mesos/Dockerfile index b90aef3655de..fb3f267fe5c7 100644 --- a/docker/spark-mesos/Dockerfile +++ b/docker/spark-mesos/Dockerfile @@ -24,7 +24,7 @@ RUN apt-get update && \ apt-get install -y python libnss3 openjdk-7-jre-headless curl RUN mkdir /opt/spark && \ - curl http://www.apache.org/dyn/closer.cgi/spark/spark-1.4.0/spark-1.4.0-bin-hadoop2.4.tgz \ + curl http://www.apache.org/dyn/closer.lua/spark/spark-1.4.0/spark-1.4.0-bin-hadoop2.4.tgz \ | tar -xzC /opt ENV SPARK_HOME /opt/spark ENV MESOS_NATIVE_JAVA_LIBRARY /usr/local/lib/libmesos.so diff --git a/docs/README.md b/docs/README.md index d7652e921f7d..1f4fd3e56ed5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,6 +8,17 @@ Read on to learn more about viewing documentation in plain text (i.e., markdown) documentation yourself. Why build it yourself? So that you have the docs that corresponds to whichever version of Spark you currently have checked out of revision control. +## Prerequisites +The Spark documentation build uses a number of tools to build HTML docs and API docs in Scala, +Python and R. To get started you can run the following commands + + $ sudo gem install jekyll + $ sudo gem install jekyll-redirect-from + $ sudo pip install Pygments + $ sudo pip install sphinx + $ Rscript -e 'install.packages(c("knitr", "devtools"), repos="http://cran.stat.ucla.edu/")' + + ## Generating the Documentation HTML We include the Spark documentation as part of the source (as opposed to using a hosted wiki, such as @@ -19,17 +30,12 @@ you have checked out or downloaded. In this directory you will find textfiles formatted using Markdown, with an ".md" suffix. You can read those text files directly if you want. Start with index.md. -The markdown code can be compiled to HTML using the [Jekyll tool](http://jekyllrb.com). -`Jekyll` and a few dependencies must be installed for this to work. We recommend -installing via the Ruby Gem dependency manager. Since the exact HTML output -varies between versions of Jekyll and its dependencies, we list specific versions here -in some cases: - - $ sudo gem install jekyll - $ sudo gem install jekyll-redirect-from +Execute `jekyll build` from the `docs/` directory to compile the site. Compiling the site with +Jekyll will create a directory called `_site` containing index.html as well as the rest of the +compiled files. -Execute `jekyll build` from the `docs/` directory to compile the site. Compiling the site with Jekyll will create a directory -called `_site` containing index.html as well as the rest of the compiled files. 
+ $ cd docs + $ jekyll build You can modify the default Jekyll build as follows: @@ -40,29 +46,6 @@ You can modify the default Jekyll build as follows: # Build the site with extra features used on the live page $ PRODUCTION=1 jekyll build -## Pygments - -We also use pygments (http://pygments.org) for syntax highlighting in documentation markdown pages, -so you will also need to install that (it requires Python) by running `sudo pip install Pygments`. - -To mark a block of code in your markdown to be syntax highlighted by jekyll during the compile -phase, use the following sytax: - - {% highlight scala %} - // Your scala code goes here, you can replace scala with many other - // supported languages too. - {% endhighlight %} - -## Sphinx - -We use Sphinx to generate Python API docs, so you will need to install it by running -`sudo pip install sphinx`. - -## knitr, devtools - -SparkR documentation is written using `roxygen2` and we use `knitr`, `devtools` to generate -documentation. To install these packages you can run `install.packages(c("knitr", "devtools"))` from a -R console. ## API Docs (Scaladoc, Sphinx, roxygen2) diff --git a/docs/_config.yml b/docs/_config.yml index c0e031a83ba9..e3a447ea7353 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 1.5.0-SNAPSHOT +SPARK_VERSION: 1.5.0 SPARK_VERSION_SHORT: 1.5.0 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" diff --git a/docs/building-spark.md b/docs/building-spark.md index a5da3b39502e..4db32cfd628b 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -7,7 +7,8 @@ redirect_from: "building-with-maven.html" * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven requires Maven 3.0.4 or newer and Java 7+. +Building Spark using Maven requires Maven 3.3.3 or newer and Java 7+. +The Spark build can supply a suitable Maven binary; see below. # Building with `build/mvn` @@ -60,12 +61,13 @@ If you don't run this, you may see errors like the following: You can fix this by setting the `MAVEN_OPTS` variable as discussed before. **Note:** -* *For Java 8 and above this step is not required.* -* *If using `build/mvn` and `MAVEN_OPTS` were not already set, the script will automate this for you.* + +* For Java 8 and above this step is not required. +* If using `build/mvn` with no `MAVEN_OPTS` set, the script will automate this for you. # Specifying the Hadoop Version -Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the "hadoop.version" property. If unset, Spark will build against Hadoop 2.2.0 by default. Note that certain build profiles are required for particular Hadoop versions: +Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the `hadoop.version` property. If unset, Spark will build against Hadoop 2.2.0 by default. 
Note that certain build profiles are required for particular Hadoop versions: @@ -90,7 +92,7 @@ mvn -Dhadoop.version=1.2.1 -Phadoop-1 -DskipTests clean package mvn -Dhadoop.version=2.0.0-mr1-cdh4.2.0 -Phadoop-1 -DskipTests clean package {% endhighlight %} -You can enable the "yarn" profile and optionally set the "yarn.version" property if it is different from "hadoop.version". Spark only supports YARN versions 2.2.0 and later. +You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later. Examples: @@ -124,7 +126,7 @@ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-thriftserver -Dskip # Building for Scala 2.11 To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` property: - dev/change-scala-version.sh 2.11 + ./dev/change-scala-version.sh 2.11 mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package Spark does not yet support its JDBC component for Scala 2.11. @@ -162,11 +164,9 @@ the `spark-parent` module). Thus, the full flow for running continuous-compilation of the `core` submodule may look more like: -``` - $ mvn install - $ cd core - $ mvn scala:cc -``` + $ mvn install + $ cd core + $ mvn scala:cc # Building Spark with IntelliJ IDEA or Eclipse @@ -192,11 +192,11 @@ then ship it over to the cluster. We are investigating the exact cause for this. # Packaging without Hadoop Dependencies for YARN -The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself. +The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with `yarn.application.classpath`. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself. # Building with SBT -Maven is the official recommendation for packaging Spark, and is the "build of reference". +Maven is the official build tool recommended for packaging Spark, and is the *build of reference*. But SBT is supported for day-to-day development since it can provide much faster iterative compilation. More advanced developers may wish to use SBT. diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index 7079de546e2f..faaf154d243f 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -5,18 +5,19 @@ title: Cluster Mode Overview This document gives a short overview of how Spark runs on clusters, to make it easier to understand the components involved. Read through the [application submission guide](submitting-applications.html) -to submit applications to a cluster. +to learn about launching applications on a cluster. 
# Components -Spark applications run as independent sets of processes on a cluster, coordinated by the SparkContext +Spark applications run as independent sets of processes on a cluster, coordinated by the `SparkContext` object in your main program (called the _driver program_). + Specifically, to run on a cluster, the SparkContext can connect to several types of _cluster managers_ -(either Spark's own standalone cluster manager or Mesos/YARN), which allocate resources across +(either Spark's own standalone cluster manager, Mesos or YARN), which allocate resources across applications. Once connected, Spark acquires *executors* on nodes in the cluster, which are processes that run computations and store data for your application. Next, it sends your application code (defined by JAR or Python files passed to SparkContext) to -the executors. Finally, SparkContext sends *tasks* for the executors to run. +the executors. Finally, SparkContext sends *tasks* to the executors to run.
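To make the driver/executor relationship described above concrete, here is a minimal, hypothetical Scala driver program (not part of this patch); the application name, the master host and the job itself are placeholders.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

// Minimal hypothetical driver: the SparkContext created here is the "driver program"
// described above, and the master URL selects the cluster manager.
object ClusterOverviewSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("cluster-overview-sketch")                // placeholder name
      .setMaster("spark://master:7077")                     // placeholder standalone master; Mesos and YARN use their own URLs
    val sc = new SparkContext(conf)
    // This job is shipped to the executors on the worker nodes and runs there as tasks.
    val counts = sc.parallelize(1 to 1000000).map(_ % 10).countByValue()
    println(counts)
    sc.stop()
  }
}
{% endhighlight %}

The same program runs unchanged on the standalone manager, Mesos or YARN; only the master URL changes.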

Spark cluster components @@ -33,9 +34,9 @@ There are several useful things to note about this architecture: 2. Spark is agnostic to the underlying cluster manager. As long as it can acquire executor processes, and these communicate with each other, it is relatively easy to run it even on a cluster manager that also supports other applications (e.g. Mesos/YARN). -3. The driver program must listen for and accept incoming connections from its executors throughout - its lifetime (e.g., see [spark.driver.port and spark.fileserver.port in the network config - section](configuration.html#networking)). As such, the driver program must be network +3. The driver program must listen for and accept incoming connections from its executors throughout + its lifetime (e.g., see [spark.driver.port and spark.fileserver.port in the network config + section](configuration.html#networking)). As such, the driver program must be network addressable from the worker nodes. 4. Because the driver schedules tasks on the cluster, it should be run close to the worker nodes, preferably on the same local area network. If you'd like to send requests to the diff --git a/docs/configuration.md b/docs/configuration.md index 24b606356a14..e1fa321da3d0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -389,7 +389,8 @@ Apart from these, the following properties are also available, and may be useful Implementation to use for transferring shuffle and cached blocks between executors. There are two implementations available: netty and nio. Netty-based block transfer is intended to be simpler but equally efficient and is the default option - starting in 1.2. + starting in 1.2, and nio block transfer is deprecated in Spark 1.5.0 and will + be removed in Spark 1.6.0.

@@ -457,9 +458,12 @@ Apart from these, the following properties are also available, and may be useful @@ -473,6 +477,25 @@ Apart from these, the following properties are also available, and may be useful spark.storage.memoryFraction. + + + + + + + + + + @@ -886,16 +909,6 @@ Apart from these, the following properties are also available, and may be useful #### Networking
spark.shuffle.manager sort - Implementation to use for shuffling data. There are two implementations available: - sort and hash. Sort-based shuffle is more memory-efficient and is - the default option starting in 1.2. + Implementation to use for shuffling data. There are three implementations available: + sort, hash and the new (1.5+) tungsten-sort. + Sort-based shuffle is more memory-efficient and is the default option starting in 1.2. + Tungsten-sort is similar to the sort based shuffle, with a direct binary cache-friendly + implementation with a fall back to regular sort based shuffle if its requirements are not + met.
spark.shuffle.service.enabled | false | + Enables the external shuffle service. This service preserves the shuffle files written by + executors so the executors can be safely removed. This must be enabled if + spark.dynamicAllocation.enabled is "true". The external shuffle service + must be set up in order to enable it. See + dynamic allocation + configuration and setup documentation for more information. +
spark.shuffle.service.port | 7337 | + Port on which the external shuffle service will run. +
spark.shuffle.sort.bypassMergeThreshold 200
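A minimal sketch, assuming a Spark 1.5 application, of setting the shuffle-related properties documented above on a `SparkConf`; the values shown are illustrative only, not recommendations.

{% highlight scala %}
import org.apache.spark.SparkConf

// Illustrative values only; spark.shuffle.service.enabled must be true when
// dynamic allocation is used, and the tungsten-sort manager is new in 1.5.
val conf = new SparkConf()
  .set("spark.shuffle.manager", "sort")                 // "sort", "hash", or "tungsten-sort" (1.5+)
  .set("spark.shuffle.service.enabled", "true")
  .set("spark.shuffle.service.port", "7337")
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.shuffle.sort.bypassMergeThreshold", "200")
{% endhighlight %}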
- - - - - @@ -1427,6 +1440,19 @@ Apart from these, the following properties are also available, and may be useful #### Spark Streaming
Property Name | Default | Meaning
spark.akka.failure-detector.threshold | 300.0 | - This is set to a larger value to disable failure detector that comes inbuilt akka. It can be - enabled again, if you plan to use this feature (Not recommended). This maps to akka's - `akka.remote.transport-failure-detector.threshold`. Tune this in combination of - `spark.akka.heartbeat.pauses` and `spark.akka.heartbeat.interval` if you need to. -
spark.akka.frameSize 128
+ + + + + @@ -1541,7 +1567,11 @@ The following variables can be set in `spark-env.sh`: - + + + + + diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 99f8c827f767..c861a763d622 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -768,16 +768,14 @@ class GraphOps[VD, ED] { // Loop until no messages remain or maxIterations is achieved var i = 0 while (activeMessages > 0 && i < maxIterations) { - // Receive the messages: ----------------------------------------------------------------------- - // Run the vertex program on all vertices that receive messages - val newVerts = g.vertices.innerJoin(messages)(vprog).cache() - // Merge the new vertex values back into the graph - g = g.outerJoinVertices(newVerts) { (vid, old, newOpt) => newOpt.getOrElse(old) }.cache() - // Send Messages: ------------------------------------------------------------------------------ - // Vertices that didn't receive a message above don't appear in newVerts and therefore don't - // get to send messages. More precisely the map phase of mapReduceTriplets is only invoked - // on edges in the activeDir of vertices in newVerts - messages = g.mapReduceTriplets(sendMsg, mergeMsg, Some((newVerts, activeDir))).cache() + // Receive the messages and update the vertices. + g = g.joinVertices(messages)(vprog).cache() + val oldMessages = messages + // Send new messages, skipping edges where neither side received a message. We must cache + // messages so it can be materialized on the next line, allowing us to uncache the previous + // iteration. + messages = g.mapReduceTriplets( + sendMsg, mergeMsg, Some((oldMessages, activeDirection))).cache() activeMessages = messages.count() i += 1 } diff --git a/docs/ml-ann.md b/docs/ml-ann.md new file mode 100644 index 000000000000..d5ddd92af1e9 --- /dev/null +++ b/docs/ml-ann.md @@ -0,0 +1,123 @@ +--- +layout: global +title: Multilayer perceptron classifier - ML +displayTitle: ML - Multilayer perceptron classifier +--- + + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + + +Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). +MLPC consists of multiple layers of nodes. +Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes maps inputs to the outputs +by performing linear combination of the inputs with the node's weights `$\wv$` and bias `$\bv$` and applying an activation function. +It can be written in matrix form for MLPC with `$K+1$` layers as follows: +`\[ +\mathrm{y}(\x) = \mathrm{f_K}(...\mathrm{f_2}(\wv_2^T\mathrm{f_1}(\wv_1^T \x+b_1)+b_2)...+b_K) +\]` +Nodes in intermediate layers use sigmoid (logistic) function: +`\[ +\mathrm{f}(z_i) = \frac{1}{1 + e^{-z_i}} +\]` +Nodes in the output layer use softmax function: +`\[ +\mathrm{f}(z_i) = \frac{e^{z_i}}{\sum_{k=1}^N e^{z_k}} +\]` +The number of nodes `$N$` in the output layer corresponds to the number of classes. + +MLPC employes backpropagation for learning the model. 
We use the logistic loss function for optimization and L-BFGS as the optimization routine. + +**Examples** + +
+ +
+ +{% highlight scala %} +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.Row + +// Load training data +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt").toDF() +// Split the data into train and test +val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) +val train = splits(0) +val test = splits(1) +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +val layers = Array[Int](4, 5, 4, 3) +// create the trainer and set its parameters +val trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100) +// train the model +val model = trainer.fit(train) +// compute precision on the test set +val result = model.transform(test) +val predictionAndLabels = result.select("prediction", "label") +val evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision") +println("Precision:" + evaluator.evaluate(predictionAndLabels)) +{% endhighlight %} + +
+ +
+ +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + +// Load training data +String path = "data/mllib/sample_multiclass_classification_data.txt"; +JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); +DataFrame dataFrame = sqlContext.createDataFrame(data, LabeledPoint.class); +// Split the data into train and test +DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); +DataFrame train = splits[0]; +DataFrame test = splits[1]; +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +int[] layers = new int[] {4, 5, 4, 3}; +// create the trainer and set its parameters +MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100); +// train the model +MultilayerPerceptronClassificationModel model = trainer.fit(train); +// compute precision on the test set +DataFrame result = model.transform(test); +DataFrame predictionAndLabels = result.select("prediction", "label"); +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision"); +System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); +{% endhighlight %} +
+ +
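Alongside the Scala and Java pipeline examples above, here is a small self-contained sketch (not from the guide) of the two activation functions defined earlier: the sigmoid used by the intermediate layers and the softmax used by the output layer.

{% highlight scala %}
// Standalone sketch of the activations in the formulas above; not Spark API.
object MlpActivations {
  // Logistic (sigmoid) activation used by intermediate layers.
  def sigmoid(z: Array[Double]): Array[Double] =
    z.map(v => 1.0 / (1.0 + math.exp(-v)))

  // Softmax activation used by the output layer; one entry per class.
  def softmax(z: Array[Double]): Array[Double] = {
    val m = z.max                           // subtract the max for numerical stability
    val exps = z.map(v => math.exp(v - m))
    val s = exps.sum
    exps.map(_ / s)
  }

  def main(args: Array[String]): Unit = {
    println(sigmoid(Array(-1.0, 0.0, 2.0)).mkString(", "))  // ~0.269, 0.5, 0.881
    println(softmax(Array(1.0, 2.0, 3.0)).mkString(", "))   // ~0.090, 0.245, 0.665
  }
}
{% endhighlight %}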
diff --git a/docs/ml-decision-tree.md b/docs/ml-decision-tree.md new file mode 100644 index 000000000000..542819e93e6d --- /dev/null +++ b/docs/ml-decision-tree.md @@ -0,0 +1,493 @@ +--- +layout: global +title: Decision Trees - SparkML +displayTitle: ML - Decision Trees +--- + +**Table of Contents** + +* This will become a table of contents (this text will be scraped). +{:toc} + + +# Overview + +[Decision trees](http://en.wikipedia.org/wiki/Decision_tree_learning) +and their ensembles are popular methods for the machine learning tasks of +classification and regression. Decision trees are widely used since they are easy to interpret, +handle categorical features, extend to the multiclass classification setting, do not require +feature scaling, and are able to capture non-linearities and feature interactions. Tree ensemble +algorithms such as random forests and boosting are among the top performers for classification and +regression tasks. + +MLlib supports decision trees for binary and multiclass classification and for regression, +using both continuous and categorical features. The implementation partitions data by rows, +allowing distributed training with millions or even billions of instances. + +Users can find more information about the decision tree algorithm in the [MLlib Decision Tree guide](mllib-decision-tree.html). In this section, we demonstrate the Pipelines API for Decision Trees. + +The Pipelines API for Decision Trees offers a bit more functionality than the original API. In particular, for classification, users can get the predicted probability of each class (a.k.a. class conditional probabilities). + +Ensembles of trees (Random Forests and Gradient-Boosted Trees) are described in the [Ensembles guide](ml-ensembles.html). + +# Inputs and Outputs + +We list the input and output (prediction) column types here. +All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. + +## Input Columns + +
Property Name | Default | Meaning
spark.streaming.backpressure.enabled | false | + Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). + This enables Spark Streaming to control the receiving rate based on the + current batch scheduling delays and processing times so that the system receives + only as fast as the system can process. Internally, this dynamically sets the + maximum receiving rate of receivers. This rate is upper bounded by the values + `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` + if they are set (see below). +
spark.streaming.blockInterval | 200ms
PYSPARK_PYTHON | Python binary executable to use for PySpark. | Python binary executable to use for PySpark in both driver and workers (default is `python`).
PYSPARK_DRIVER_PYTHON | Python binary executable to use for PySpark in driver only (default is PYSPARK_PYTHON).
SPARK_LOCAL_IP
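A minimal sketch, assuming a Spark Streaming 1.5 application, of enabling the backpressure mechanism described above; the application name, batch interval and rate cap are placeholder values.

{% highlight scala %}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Placeholder batch interval and rate cap; with backpressure enabled, Spark Streaming
// throttles receivers based on scheduling delay and processing time, as described above.
val conf = new SparkConf()
  .setAppName("backpressure-sketch")
  .setMaster("local[2]")
  .set("spark.streaming.backpressure.enabled", "true")
  .set("spark.streaming.receiver.maxRate", "10000")   // upper bound on the dynamic rate
val ssc = new StreamingContext(conf, Seconds(2))
{% endhighlight %}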
+ + + + + + + + + + + + + + + + + + + + + + +
Param name | Type(s) | Default | Description
labelCol | Double | "label" | Label to predict
featuresCol | Vector | "features" | Feature vector
+ +## Output Columns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Param name | Type(s) | Default | Description | Notes
predictionCol | Double | "prediction" | Predicted label
rawPredictionCol | Vector | "rawPrediction" | Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction | Classification only
probabilityCol | Vector | "probability" | Vector of length # classes equal to rawPrediction normalized to a multinomial distribution | Classification only
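A tiny sketch (plain Scala, not the Spark API) of how these output columns relate for classification: `probability` is `rawPrediction` rescaled to sum to one, and `prediction` is the index of its largest entry; the counts below are hypothetical.

{% highlight scala %}
// Hypothetical per-class counts at the predicted tree node (3 classes).
val rawPrediction = Array(3.0, 12.0, 5.0)
// Normalize to a multinomial distribution, as described for probabilityCol.
val probability = rawPrediction.map(_ / rawPrediction.sum)   // Array(0.15, 0.6, 0.25)
// The predicted label corresponds to the most probable class.
val prediction = probability.indexOf(probability.max).toDouble // 1.0
{% endhighlight %}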
+ +# Examples + +The below examples demonstrate the Pipelines API for Decision Trees. The main differences between this API and the [original MLlib Decision Tree API](mllib-decision-tree.html) are: + +* support for ML Pipelines +* separation of Decision Trees for classification vs. regression +* use of DataFrame metadata to distinguish continuous and categorical features + + +## Classification + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize. + +
+
+ +More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier). + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.DecisionTreeClassifier +import org.apache.spark.ml.classification.DecisionTreeClassificationModel +import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data) +// Automatically identify categorical features, and index them. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a DecisionTree model. +val dt = new DecisionTreeClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + +// Convert indexed labels back to original labels. +val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + +// Chain indexers and tree in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, dt, labelConverter)) + +// Train model. This also runs the indexers. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision") +val accuracy = evaluator.evaluate(predictions) +println("Test Error = " + (1.0 - accuracy)) + +val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel] +println("Learned classification tree model:\n" + treeModel.toDebugString) +{% endhighlight %} +
+ +
+ +More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html). + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.DecisionTreeClassifier; +import org.apache.spark.ml.classification.DecisionTreeClassificationModel; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.ml.feature.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +StringIndexerModel labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data); +// Automatically identify categorical features, and index them. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a DecisionTree model. +DecisionTreeClassifier dt = new DecisionTreeClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures"); + +// Convert indexed labels back to original labels. +IndexToString labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels()); + +// Chain indexers and tree in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, dt, labelConverter}); + +// Train model. This also runs the indexers. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision"); +double accuracy = evaluator.evaluate(predictions); +System.out.println("Test Error = " + (1.0 - accuracy)); + +DecisionTreeClassificationModel treeModel = + (DecisionTreeClassificationModel)(model.stages()[2]); +System.out.println("Learned classification tree model:\n" + treeModel.toDebugString()); +{% endhighlight %} +
+ +
+ +More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier). + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.classification import DecisionTreeClassifier +from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Index labels, adding metadata to the label column. +# Fit on whole dataset to include all labels in index. +labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) +# Automatically identify categorical features, and index them. +# We specify maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a DecisionTree model. +dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") + +# Chain indexers and tree in a Pipeline +pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt]) + +# Train model. This also runs the indexers. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "indexedLabel", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") +accuracy = evaluator.evaluate(predictions) +print "Test Error = %g" % (1.0 - accuracy) + +treeModel = model.stages[2] +print treeModel # summary only +{% endhighlight %} +
+ +
+ + +## Regression + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize. + +
+
+ +More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.regression.DecisionTreeRegressor). + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.regression.DecisionTreeRegressor +import org.apache.spark.ml.regression.DecisionTreeRegressionModel +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Automatically identify categorical features, and index them. +// Here, we treat features with > 4 distinct values as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a DecisionTree model. +val dt = new DecisionTreeRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + +// Chain indexer and tree in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(featureIndexer, dt)) + +// Train model. This also runs the indexer. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse") +val rmse = evaluator.evaluate(predictions) +println("Root Mean Squared Error (RMSE) on test data = " + rmse) + +val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] +println("Learned regression tree model:\n" + treeModel.toDebugString) +{% endhighlight %} +
+ +
+ +More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/regression/DecisionTreeRegressor.html). + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.ml.regression.DecisionTreeRegressionModel; +import org.apache.spark.ml.regression.DecisionTreeRegressor; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a DecisionTree model. +DecisionTreeRegressor dt = new DecisionTreeRegressor() + .setFeaturesCol("indexedFeatures"); + +// Chain indexer and tree in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {featureIndexer, dt}); + +// Train model. This also runs the indexer. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("label", "features").show(5); + +// Select (prediction, true label) and compute test error +RegressionEvaluator evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse"); +double rmse = evaluator.evaluate(predictions); +System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); + +DecisionTreeRegressionModel treeModel = + (DecisionTreeRegressionModel)(model.stages()[1]); +System.out.println("Learned regression tree model:\n" + treeModel.toDebugString()); +{% endhighlight %} +
+ +
+ +More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.regression.DecisionTreeRegressor). + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.regression import DecisionTreeRegressor +from pyspark.ml.feature import VectorIndexer +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Automatically identify categorical features, and index them. +# We specify maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a DecisionTree model. +dt = DecisionTreeRegressor(featuresCol="indexedFeatures") + +# Chain indexer and tree in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, dt]) + +# Train model. This also runs the indexer. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = RegressionEvaluator( + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) +print "Root Mean Squared Error (RMSE) on test data = %g" % rmse + +treeModel = model.stages[1] +print treeModel # summary only +{% endhighlight %} +
+ +
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 9ff50e95fc47..62749909e01d 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -11,11 +11,947 @@ displayTitle: ML - Ensembles An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning) is a learning algorithm which creates a model composed of a set of other base models. -The Pipelines API supports the following ensemble algorithms: [`OneVsRest`](api/scala/index.html#org.apache.spark.ml.classifier.OneVsRest) -## OneVsRest +## Tree Ensembles -[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently. +The Pipelines API supports two major tree ensemble algorithms: [Random Forests](http://en.wikipedia.org/wiki/Random_forest) and [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting). +Both use [MLlib decision trees](ml-decision-tree.html) as their base models. + +Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html). In this section, we demonstrate the Pipelines API for ensembles. + +The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are: +* support for ML Pipelines +* separation of classification vs. regression +* use of DataFrame metadata to distinguish continuous and categorical features +* a bit more functionality for random forests: estimates of feature importance, as well as the predicted probability of each class (a.k.a. class conditional probabilities) for classification. + +### Random Forests + +[Random forests](http://en.wikipedia.org/wiki/Random_forest) +are ensembles of [decision trees](ml-decision-tree.html). +Random forests combine many decision trees in order to reduce the risk of overfitting. +MLlib supports random forests for binary and multiclass classification and for regression, +using both continuous and categorical features. + +This section gives examples of using random forests with the Pipelines API. +For more information on the algorithm, please see the [main MLlib docs on random forests](mllib-ensembles.html). + +#### Inputs and Outputs + +We list the input and output (prediction) column types here. +All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. + +##### Input Columns + + + + + + + + + + + + + + + + + + + + + + + + +
Param name | Type(s) | Default | Description
labelCol | Double | "label" | Label to predict
featuresCol | Vector | "features" | Feature vector
+ +##### Output Columns (Predictions) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Param name | Type(s) | Default | Description | Notes
predictionCol | Double | "prediction" | Predicted label
rawPredictionCol | Vector | "rawPrediction" | Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction | Classification only
probabilityCol | Vector | "probability" | Vector of length # classes equal to rawPrediction normalized to a multinomial distribution | Classification only
+ +#### Example: Classification + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.RandomForestClassifier +import org.apache.spark.ml.classification.RandomForestClassificationModel +import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data) +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a RandomForest model. +val rf = new RandomForestClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setNumTrees(10) + +// Convert indexed labels back to original labels. +val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + +// Chain indexers and forest in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) + +// Train model. This also runs the indexers. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision") +val accuracy = evaluator.evaluate(predictions) +println("Test Error = " + (1.0 - accuracy)) + +val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel] +println("Learned classification forest model:\n" + rfModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/RandomForestClassifier.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.RandomForestClassifier; +import org.apache.spark.ml.classification.RandomForestClassificationModel; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.ml.feature.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +StringIndexerModel labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data); +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a RandomForest model. +RandomForestClassifier rf = new RandomForestClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures"); + +// Convert indexed labels back to original labels. +IndexToString labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels()); + +// Chain indexers and forest in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter}); + +// Train model. This also runs the indexers. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision"); +double accuracy = evaluator.evaluate(predictions); +System.out.println("Test Error = " + (1.0 - accuracy)); + +RandomForestClassificationModel rfModel = + (RandomForestClassificationModel)(model.stages()[2]); +System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.classification import RandomForestClassifier +from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Index labels, adding metadata to the label column. +# Fit on whole dataset to include all labels in index. +labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a RandomForest model. +rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") + +# Chain indexers and forest in a Pipeline +pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) + +# Train model. This also runs the indexers. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "indexedLabel", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") +accuracy = evaluator.evaluate(predictions) +print "Test Error = %g" % (1.0 - accuracy) + +rfModel = model.stages[2] +print rfModel # summary only +{% endhighlight %} +
+
+ +#### Example: Regression + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.RandomForestRegressor) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.regression.RandomForestRegressor +import org.apache.spark.ml.regression.RandomForestRegressionModel +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a RandomForest model. +val rf = new RandomForestRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + +// Chain indexer and forest in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(featureIndexer, rf)) + +// Train model. This also runs the indexer. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse") +val rmse = evaluator.evaluate(predictions) +println("Root Mean Squared Error (RMSE) on test data = " + rmse) + +val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] +println("Learned regression forest model:\n" + rfModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/RandomForestRegressor.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.ml.regression.RandomForestRegressionModel; +import org.apache.spark.ml.regression.RandomForestRegressor; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a RandomForest model. +RandomForestRegressor rf = new RandomForestRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures"); + +// Chain indexer and forest in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {featureIndexer, rf}); + +// Train model. This also runs the indexer. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +RegressionEvaluator evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse"); +double rmse = evaluator.evaluate(predictions); +System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); + +RandomForestRegressionModel rfModel = + (RandomForestRegressionModel)(model.stages()[1]); +System.out.println("Learned regression forest model:\n" + rfModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.RandomForestRegressor) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.regression import RandomForestRegressor +from pyspark.ml.feature import VectorIndexer +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a RandomForest model. +rf = RandomForestRegressor(featuresCol="indexedFeatures") + +# Chain indexer and forest in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, rf]) + +# Train model. This also runs the indexer. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = RegressionEvaluator( + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) +print "Root Mean Squared Error (RMSE) on test data = %g" % rmse + +rfModel = model.stages[1] +print rfModel # summary only +{% endhighlight %} +
+
+ +### Gradient-Boosted Trees (GBTs) + +[Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting) +are ensembles of [decision trees](ml-decision-tree.html). +GBTs iteratively train decision trees in order to minimize a loss function. +MLlib supports GBTs for binary classification and for regression, +using both continuous and categorical features. + +This section gives examples of using GBTs with the Pipelines API. +For more information on the algorithm, please see the [main MLlib docs on GBTs](mllib-ensembles.html). + +#### Inputs and Outputs + +We list the input and output (prediction) column types here. +All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. + +##### Input Columns + + + + + + + + + + + + + + + + + + + + + + + + +
Param nameType(s)DefaultDescription
labelColDouble"label"Label to predict
featuresColVector"features"Feature vector
+ +Note that `GBTClassifier` currently only supports binary labels. + +##### Output Columns (Predictions) + + + + + + + + + + + + + + + + + + + + +
Param nameType(s)DefaultDescriptionNotes
predictionColDouble"prediction"Predicted label
+ +In the future, `GBTClassifier` will also output columns for `rawPrediction` and `probability`, just as `RandomForestClassifier` does. + +#### Example: Classification + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.GBTClassifier) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.GBTClassifier +import org.apache.spark.ml.classification.GBTClassificationModel +import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data) +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a GBT model. +val gbt = new GBTClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10) + +// Convert indexed labels back to original labels. +val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + +// Chain indexers and GBT in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter)) + +// Train model. This also runs the indexers. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision") +val accuracy = evaluator.evaluate(predictions) +println("Test Error = " + (1.0 - accuracy)) + +val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel] +println("Learned classification GBT model:\n" + gbtModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/GBTClassifier.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.GBTClassifier; +import org.apache.spark.ml.classification.GBTClassificationModel; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.ml.feature.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +StringIndexerModel labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data); +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a GBT model. +GBTClassifier gbt = new GBTClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10); + +// Convert indexed labels back to original labels. +IndexToString labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels()); + +// Chain indexers and GBT in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter}); + +// Train model. This also runs the indexers. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision"); +double accuracy = evaluator.evaluate(predictions); +System.out.println("Test Error = " + (1.0 - accuracy)); + +GBTClassificationModel gbtModel = + (GBTClassificationModel)(model.stages()[2]); +System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.classification import GBTClassifier +from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Index labels, adding metadata to the label column. +# Fit on whole dataset to include all labels in index. +labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GBT model. +gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10) + +# Chain indexers and GBT in a Pipeline +pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt]) + +# Train model. This also runs the indexers. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "indexedLabel", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") +accuracy = evaluator.evaluate(predictions) +print "Test Error = %g" % (1.0 - accuracy) + +gbtModel = model.stages[2] +print gbtModel # summary only +{% endhighlight %} +
+
+ +#### Example: Regression + +Note: For this example dataset, `GBTRegressor` actually only needs 1 iteration, but that will not +be true in general. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.GBTRegressor) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.regression.GBTRegressor +import org.apache.spark.ml.regression.GBTRegressionModel +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a GBT model. +val gbt = new GBTRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10) + +// Chain indexer and GBT in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(featureIndexer, gbt)) + +// Train model. This also runs the indexer. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse") +val rmse = evaluator.evaluate(predictions) +println("Root Mean Squared Error (RMSE) on test data = " + rmse) + +val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] +println("Learned regression GBT model:\n" + gbtModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/GBTRegressor.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.ml.regression.GBTRegressionModel; +import org.apache.spark.ml.regression.GBTRegressor; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a GBT model. +GBTRegressor gbt = new GBTRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10); + +// Chain indexer and GBT in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {featureIndexer, gbt}); + +// Train model. This also runs the indexer. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +RegressionEvaluator evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse"); +double rmse = evaluator.evaluate(predictions); +System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); + +GBTRegressionModel gbtModel = + (GBTRegressionModel)(model.stages()[1]); +System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.GBTRegressor) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.regression import GBTRegressor +from pyspark.ml.feature import VectorIndexer +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GBT model. +gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10) + +# Chain indexer and GBT in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, gbt]) + +# Train model. This also runs the indexer. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = RegressionEvaluator( + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) +print "Root Mean Squared Error (RMSE) on test data = %g" % rmse + +gbtModel = model.stages[1] +print gbtModel # summary only +{% endhighlight %} +
+
+ + +## One-vs-Rest (a.k.a. One-vs-All) + +[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently. It is also known as "One-vs-All." `OneVsRest` is implemented as an `Estimator`. For the base classifier it takes instances of `Classifier` and creates a binary classification problem for each of the k classes. The classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes. @@ -28,6 +964,9 @@ The example below demonstrates how to load the
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.OneVsRest) for more details. + {% highlight scala %} import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} import org.apache.spark.mllib.evaluation.MulticlassMetrics @@ -64,9 +1003,12 @@ println("label\tfpr\n") } {% endhighlight %}
+
-{% highlight java %} +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/OneVsRest.html) for more details. + +{% highlight java %} import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.LogisticRegression; @@ -88,7 +1030,7 @@ RDD data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_multiclass_classification_data.txt"); DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); -DataFrame[] splits = dataFrame.randomSplit(new double[]{0.7, 0.3}, 12345); +DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345); DataFrame train = splits[0]; DataFrame test = splits[1]; diff --git a/docs/ml-features.md b/docs/ml-features.md index 54068debe215..90654d1e5a24 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -55,7 +55,7 @@ rescaledData.select("features", "label").take(3).foreach(println)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.HashingTF; @@ -70,7 +70,7 @@ import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( +JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0, "Hi I heard about Spark"), RowFactory.create(0, "I wish Java could use case classes"), RowFactory.create(1, "Logistic regression models are neat") @@ -153,7 +153,7 @@ result.select("result").take(3).foreach(println)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -167,10 +167,10 @@ JavaSparkContext jsc = ... SQLContext sqlContext = ... // Input data: Each row is a bag of words from a sentence or document. -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( - RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))), - RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))), - RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" "))) +JavaRDD jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), + RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), + RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) )); StructType schema = new StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) @@ -211,35 +211,156 @@ for feature in result.select("result").take(3):
+## CountVectorizer + +`CountVectorizer` and `CountVectorizerModel` aim to help convert a collection of text documents + to vectors of token counts. When an a-priori dictionary is not available, `CountVectorizer` can + be used as an `Estimator` to extract the vocabulary and generate a `CountVectorizerModel`. The + model produces sparse representations for the documents over the vocabulary, which can then be + passed to other algorithms like LDA. + + During the fitting process, `CountVectorizer` will select the top `vocabSize` words ordered by + term frequency across the corpus. An optional parameter "minDF" also affects the fitting process + by specifying the minimum number (or fraction if < 1.0) of documents a term must appear in to be + included in the vocabulary. + +**Examples** + +Assume that we have the following DataFrame with columns `id` and `texts`: + +~~~~ + id | texts +----|---------- + 0 | Array("a", "b", "c") + 1 | Array("a", "b", "b", "c", "a") +~~~~ + +Each row in `texts` is a document of type Array[String]. +Invoking `fit` on `CountVectorizer` produces a `CountVectorizerModel` with vocabulary (a, b, c); +the output column "vector" after transformation then contains: + +~~~~ + id | texts | vector +----|---------------------------------|--------------- + 0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0]) + 1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0]) +~~~~ + +Each vector represents the token counts of the document over the vocabulary. +
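As noted above, `minDF` accepts either an absolute document count or, when it is less than 1.0, a fraction of the corpus. The short Scala sketch below is illustrative only (it assumes a DataFrame with a `words` column like the one in the tables above); complete fit/transform examples follow in the language tabs below.

{% highlight scala %}
import org.apache.spark.ml.feature.CountVectorizer

// A fractional minDF keeps only terms that appear in at least half of the
// documents; an integer value (e.g. 2) is read as an absolute document count.
val cvFractional = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(0.5)
{% endhighlight %}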
+
+More details can be found in the API docs for +[CountVectorizer](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizer) and +[CountVectorizerModel](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.CountVectorizer +import org.apache.spark.ml.feature.CountVectorizerModel + +val df = sqlContext.createDataFrame(Seq( + (0, Array("a", "b", "c")), + (1, Array("a", "b", "b", "c", "a")) +)).toDF("id", "words") + +// fit a CountVectorizerModel from the corpus +val cvModel: CountVectorizerModel = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setVocabSize(3) + .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary + .fit(df) + +// alternatively, define CountVectorizerModel with an a-priori vocabulary +val cvm = new CountVectorizerModel(Array("a", "b", "c")) + .setInputCol("words") + .setOutputCol("features") + +cvModel.transform(df).select("features").show() +{% endhighlight %}
+ +
+More details can be found in the API docs for +[CountVectorizer](api/java/org/apache/spark/ml/feature/CountVectorizer.html) and +[CountVectorizerModel](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html). +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.CountVectorizer; +import org.apache.spark.ml.feature.CountVectorizerModel; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; + +// Input data: Each row is a bag of words from a sentence or document. +JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("a", "b", "c")), + RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")) +)); +StructType schema = new StructType(new StructField[] { + new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) +}); +DataFrame df = sqlContext.createDataFrame(jrdd, schema); + +// fit a CountVectorizerModel from the corpus +CountVectorizerModel cvModel = new CountVectorizer() + .setInputCol("text") + .setOutputCol("feature") + .setVocabSize(3) + .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary + .fit(df); + +// alternatively, define CountVectorizerModel with an a-priori vocabulary +CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"}) + .setInputCol("text") + .setOutputCol("feature"); + +cvModel.transform(df).show(); +{% endhighlight %}
+
+ # Feature Transformers ## Tokenizer [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words. -Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer). +[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more + advanced tokenization based on regular expression (regex) matching. + By default, the parameter "pattern" (regex, default: \\s+) is used as delimiters to split the input text. + Alternatively, users can set parameter "gaps" to false indicating the regex "pattern" denotes + "tokens" rather than splitting gaps, and find all matching occurrences as the tokenization result.
{% highlight scala %} -import org.apache.spark.ml.feature.Tokenizer +import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} val sentenceDataFrame = sqlContext.createDataFrame(Seq( (0, "Hi I heard about Spark"), - (0, "I wish Java could use case classes"), - (1, "Logistic regression models are neat") + (1, "I wish Java could use case classes"), + (2, "Logistic,regression,models,are,neat") )).toDF("label", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") -val wordsDataFrame = tokenizer.transform(sentenceDataFrame) -wordsDataFrame.select("words", "label").take(3).foreach(println) +val regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select("words", "label").take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select("words", "label").take(3).foreach(println) {% endhighlight %}
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RegexTokenizer; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.DataFrame; @@ -250,10 +371,10 @@ import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( +JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0, "Hi I heard about Spark"), - RowFactory.create(0, "I wish Java could use case classes"), - RowFactory.create(1, "Logistic regression models are neat") + RowFactory.create(1, "I wish Java could use case classes"), + RowFactory.create(2, "Logistic,regression,models,are,neat") )); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), @@ -267,35 +388,138 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) { for (String word : words) System.out.print(word + " "); System.out.println(); } + +RegexTokenizer regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); {% endhighlight %}
{% highlight python %} -from pyspark.ml.feature import Tokenizer +from pyspark.ml.feature import Tokenizer, RegexTokenizer sentenceDataFrame = sqlContext.createDataFrame([ (0, "Hi I heard about Spark"), - (0, "I wish Java could use case classes"), - (1, "Logistic regression models are neat") + (1, "I wish Java could use case classes"), + (2, "Logistic,regression,models,are,neat") ], ["label", "sentence"]) tokenizer = Tokenizer(inputCol="sentence", outputCol="words") wordsDataFrame = tokenizer.transform(sentenceDataFrame) for words_label in wordsDataFrame.select("words", "label").take(3): print(words_label) +regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W") +# alternatively, pattern="\\w+", gaps(False) {% endhighlight %}
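To make the `gaps` parameter described above concrete, here is a minimal, illustrative Scala sketch in which the regex pattern matches the tokens themselves rather than the delimiters between them (it reuses the `sentenceDataFrame` from the Scala example above):

{% highlight scala %}
import org.apache.spark.ml.feature.RegexTokenizer

// With gaps = false the pattern describes the tokens to keep ("\\w+" = runs of
// word characters) instead of the gaps to split on (the default behaviour).
val tokenMatcher = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)

// Assumes the sentenceDataFrame defined in the Scala example above.
tokenMatcher.transform(sentenceDataFrame).select("words").show()
{% endhighlight %}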
+## StopWordsRemover +[Stop words](https://en.wikipedia.org/wiki/Stop_words) are words which +should be excluded from the input, typically because the words appear +frequently and don't carry as much meaning. -## $n$-gram +`StopWordsRemover` takes as input a sequence of strings (e.g. the output +of a [Tokenizer](ml-features.html#tokenizer)) and drops all the stop +words from the input sequences. The list of stopwords is specified by +the `stopWords` parameter. We provide [a list of stop +words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by +default, accessible by calling `getStopWords` on a newly instantiated +`StopWordsRemover` instance. -An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams. +**Examples** -`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced. +Assume that we have the following DataFrame with columns `id` and `raw`: + +~~~~ + id | raw +----|---------- + 0 | [I, saw, the, red, baloon] + 1 | [Mary, had, a, little, lamb] +~~~~ + +Applying `StopWordsRemover` with `raw` as the input column and `filtered` as the output +column, we should get the following: + +~~~~ + id | raw | filtered +----|-----------------------------|-------------------- + 0 | [I, saw, the, red, baloon] | [saw, red, baloon] + 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb] +~~~~ + +In `filtered`, the stop words "I", "the", "had", and "a" have been +filtered out.
+
+ +[`StopWordsRemover`](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). + +{% highlight scala %} +import org.apache.spark.ml.feature.StopWordsRemover + +val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") +val dataSet = sqlContext.createDataFrame(Seq( + (0, Seq("I", "saw", "the", "red", "baloon")), + (1, Seq("Mary", "had", "a", "little", "lamb")) +)).toDF("id", "raw") + +remover.transform(dataSet).show() +{% endhighlight %} +
+ +
+ +[`StopWordsRemover`](api/java/org/apache/spark/ml/feature/StopWordsRemover.html) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.StopWordsRemover; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +StopWordsRemover remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered"); + +JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) +)); +StructType schema = new StructType(new StructField[] { + new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) +}); +DataFrame dataset = jsql.createDataFrame(rdd, schema); + +remover.transform(dataset).show(); +{% endhighlight %} +
+
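The examples above rely on the default stop word list. The parameters mentioned earlier can also be set and inspected explicitly; a minimal Scala sketch (the custom word list here is purely illustrative):

{% highlight scala %}
import org.apache.spark.ml.feature.StopWordsRemover

val customRemover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
  .setStopWords(Array("a", "the", "had"))  // illustrative custom list
  .setCaseSensitive(false)                 // default is case-insensitive matching

// Inspect the list that will actually be applied.
println(customRemover.getStopWords.mkString(", "))
{% endhighlight %}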
+ +## $n$-gram + +An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams. + +`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer)). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced. +
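As a quick, illustrative Scala sketch of the note above (the `words` column is hypothetical; a row with fewer than `n` tokens simply produces no $n$-grams):

{% highlight scala %}
import org.apache.spark.ml.feature.NGram

val tokenDF = sqlContext.createDataFrame(Seq(
  (0, Array("Hi", "I", "heard", "about", "Spark")),
  (1, Array("Spark"))  // fewer than n tokens, so no 2-grams are produced for this row
)).toDF("label", "words")

val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram.transform(tokenDF).select("ngrams").show()
{% endhighlight %}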
@@ -322,7 +546,7 @@ ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(pri [`NGram`](api/java/org/apache/spark/ml/feature/NGram.html) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). {% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.NGram; @@ -335,10 +559,10 @@ import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( - RowFactory.create(0D, Lists.newArrayList("Hi", "I", "heard", "about", "Spark")), - RowFactory.create(1D, Lists.newArrayList("I", "wish", "Java", "could", "use", "case", "classes")), - RowFactory.create(2D, Lists.newArrayList("Logistic", "regression", "models", "are", "neat")) +JavaRDD jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")), + RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")), + RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat")) )); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), @@ -408,7 +632,7 @@ binarizedFeatures.collect().foreach(println)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.Binarizer; @@ -420,7 +644,7 @@ import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( +JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0, 0.1), RowFactory.create(1, 0.8), RowFactory.create(2, 0.2) @@ -461,6 +685,92 @@ for binarized_feature, in binarizedFeatures.collect():
+## PCA + +[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. A [PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a model to project vectors to a low-dimensional space using PCA. The example below shows how to project 5-dimensional feature vectors into 3-dimensional principal components. + +
+
+See the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API details. +{% highlight scala %} +import org.apache.spark.ml.feature.PCA +import org.apache.spark.mllib.linalg.Vectors + +val data = Array( + Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) +) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df) +val pcaDF = pca.transform(df) +val result = pcaDF.select("pcaFeatures") +result.show() +{% endhighlight %} +
+ +
+See the [Java API documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details. +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.PCA; +import org.apache.spark.ml.feature.PCAModel; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaSparkContext jsc = ... +SQLContext jsql = ... +JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), + RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), + RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +PCAModel pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df); +DataFrame result = pca.transform(df).select("pcaFeatures"); +result.show(); +{% endhighlight %}
+ +
+See the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API details. +{% highlight python %} +from pyspark.ml.feature import PCA +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), + (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), + (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] +df = sqlContext.createDataFrame(data,["features"]) +pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") +model = pca.fit(df) +result = model.transform(df).select("pcaFeatures") +result.show(truncate=False) +{% endhighlight %} +
+
+ ## PolynomialExpansion [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is the process of expanding your features into a polynomial space, which is formulated by an n-degree combination of original dimensions. A [PolynomialExpansion](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion) class provides this functionality. The example below shows how to expand your features into a 3-degree polynomial space. @@ -488,7 +798,7 @@ polyDF.select("polyFeatures").take(3).foreach(println)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -509,7 +819,7 @@ PolynomialExpansion polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3); -JavaRDD data = jsc.parallelize(Lists.newArrayList( +JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)) @@ -544,12 +854,87 @@ for expanded in polyDF.select("polyFeatures").take(3):
+## Discrete Cosine Transform (DCT) + +The [Discrete Cosine +Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform) +transforms a length $N$ real-valued sequence in the time domain into +another length $N$ real-valued sequence in the frequency domain. A +[DCT](api/scala/index.html#org.apache.spark.ml.feature.DCT) class +provides this functionality, implementing the +[DCT-II](https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II) +and scaling the result by $1/\sqrt{2}$ such that the representing matrix +for the transform is unitary. No shift is applied to the transformed +sequence (e.g. the $0$th element of the transformed sequence is the +$0$th DCT coefficient and _not_ the $N/2$th). + +
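For reference, one standard way to write the unitary (orthonormal) DCT-II described above is given below. The exact normalization applied internally is an implementation detail, so treat this as illustrative rather than a specification of the output values.

`\begin{equation}
  X_k = \sqrt{\frac{2}{N}} \, c_k \sum_{n=0}^{N-1} x_n \cos\left[\frac{\pi}{N}\left(n + \frac{1}{2}\right)k\right],
  \qquad c_0 = \frac{1}{\sqrt{2}}, \quad c_k = 1 \text{ for } k > 0
\end{equation}`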
+
+{% highlight scala %} +import org.apache.spark.ml.feature.DCT +import org.apache.spark.mllib.linalg.Vectors + +val data = Seq( + Vectors.dense(0.0, 1.0, -2.0, 3.0), + Vectors.dense(-1.0, 2.0, 4.0, -7.0), + Vectors.dense(14.0, -2.0, -5.0, 1.0)) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false) +val dctDf = dct.transform(df) +dctDf.select("featuresDCT").show(3) +{% endhighlight %} +
+ +
+{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.DCT; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), + RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), + RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +DCT dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false); +DataFrame dctDf = dct.transform(df); +dctDf.select("featuresDCT").show(3); +{% endhighlight %} +
+
+ ## StringIndexer `StringIndexer` encodes a string column of labels to a column of label indices. The indices are in `[0, numLabels)`, ordered by label frequencies. So the most frequent label gets index `0`. -If the input column is numeric, we cast it to string and index the string values. +If the input column is numeric, we cast it to string and index the string +values. When downstream pipeline components such as `Estimator` or +`Transformer` make use of this string-indexed label, you must set the input +column of the component to this string-indexed column name. In many cases, +you can set the input column with `setInputCol`. **Examples** @@ -693,7 +1078,7 @@ encoded.select("id", "categoryVec").foreach(println)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.OneHotEncoder; @@ -707,7 +1092,7 @@ import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( +JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0, "a"), RowFactory.create(1, "b"), RowFactory.create(2, "c"), @@ -953,6 +1338,7 @@ val scaledData = scalerModel.transform(dataFrame) {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StandardScaler; +import org.apache.spark.ml.feature.StandardScalerModel; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; @@ -993,6 +1379,76 @@ scaledData = scalerModel.transform(dataFrame)
+## MinMaxScaler + +`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters: + +* `min`: 0.0 by default. Lower bound after transformation, shared by all features. +* `max`: 1.0 by default. Upper bound after transformation, shared by all features. + +`MinMaxScaler` computes summary statistics on a data set and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range. + +The rescaled value for a feature E is calculated as, +`\begin{equation} + Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min +\end{equation}` +For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)` + +Note that since zero values will probably be transformed to non-zero values, output of the transformer will be DenseVector even for sparse input. + +The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1]. + +
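For a target range other than the default [0, 1], the bounds can be set explicitly. A minimal, illustrative Scala sketch (assuming a `features` column as in the examples below):

{% highlight scala %}
import org.apache.spark.ml.feature.MinMaxScaler

// Rescale each feature to [-1, 1] instead of the default [0, 1].
// For a feature value e_i the model computes
//   (e_i - E_min) / (E_max - E_min) * (max - min) + min,
// so a feature whose column minimum is 2.0 and maximum is 6.0 maps 4.0 to 0.0 here.
val customScaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setMin(-1.0)
  .setMax(1.0)
{% endhighlight %}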
+
+More details can be found in the API docs for +[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and +[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +val dataFrame = sqlContext.createDataFrame(data) +val scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures") + +// Compute summary statistics and generate MinMaxScalerModel +val scalerModel = scaler.fit(dataFrame) + +// rescale each feature to range [min, max]. +val scaledData = scalerModel.transform(dataFrame) +{% endhighlight %} +
+ +
+More details can be found in the API docs for +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.MinMaxScaler; +import org.apache.spark.ml.feature.MinMaxScalerModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; + +JavaRDD data = + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); +DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +MinMaxScaler scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures"); + +// Compute summary statistics and generate MinMaxScalerModel +MinMaxScalerModel scalerModel = scaler.fit(dataFrame); + +// rescale each feature to range [min, max]. +DataFrame scaledData = scalerModel.transform(dataFrame); +{% endhighlight %} +
+
+ ## Bucketizer `Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter: @@ -1030,7 +1486,7 @@ val bucketedData = bucketizer.transform(dataFrame)
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; @@ -1042,7 +1498,7 @@ import org.apache.spark.sql.types.StructType; double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY}; -JavaRDD data = jsc.parallelize(Lists.newArrayList( +JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), @@ -1107,7 +1563,7 @@ v_N This example below demonstrates how to transform vectors using a transforming vector value.
-
+
{% highlight scala %} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors @@ -1124,14 +1580,14 @@ val transformer = new ElementwiseProduct() .setOutputCol("transformedVector") // Batch transform the vectors to create new column: -val transformedData = transformer.transform(dataFrame) +transformer.transform(dataFrame).show() {% endhighlight %}
-
+
{% highlight java %} -import com.google.common.collect.Lists; +import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.ElementwiseProduct; @@ -1147,7 +1603,7 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; // Create some vector data; also works for sparse vectors -JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( +JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)), RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)) )); @@ -1162,10 +1618,25 @@ ElementwiseProduct transformer = new ElementwiseProduct() .setInputCol("vector") .setOutputCol("transformedVector"); // Batch transform the vectors to create new column: -DataFrame transformedData = transformer.transform(dataFrame); +transformer.transform(dataFrame).show(); + +{% endhighlight %} +
+ +
+{% highlight python %} +from pyspark.ml.feature import ElementwiseProduct +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)] +df = sqlContext.createDataFrame(data, ["vector"]) +transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), + inputCol="vector", outputCol="transformedVector") +transformer.transform(df).show() {% endhighlight %}
+
## VectorAssembler @@ -1284,3 +1755,242 @@ print(output.select("features", "clicked").first()) # Feature Selectors +## VectorSlicer + +`VectorSlicer` is a transformer that takes a feature vector and outputs a new feature vector with a +sub-array of the original features. It is useful for extracting features from a vector column. + +`VectorSlicer` accepts a vector column with a specified indices, then outputs a new vector column +whose values are selected via those indices. There are two types of indices, + + 1. Integer indices that represents the indices into the vector, `setIndices()`; + + 2. String indices that represents the names of features into the vector, `setNames()`. + *This requires the vector column to have an `AttributeGroup` since the implementation matches on + the name field of an `Attribute`.* + +Specification by integer and string are both acceptable. Moreover, you can use integer index and +string name simultaneously. At least one feature must be selected. Duplicate features are not +allowed, so there can be no overlap between selected indices and names. Note that if names of +features are selected, an exception will be threw out when encountering with empty input attributes. + +The output vector will order features with the selected indices first (in the order given), +followed by the selected names (in the order given). + +**Examples** + +Suppose that we have a DataFrame with the column `userFeatures`: + +~~~ + userFeatures +------------------ + [0.0, 10.0, 0.5] +~~~ + +`userFeatures` is a vector column that contains three user features. Assuming that the first column +of `userFeatures` are all zeros, so we want to remove it and only the last two columns are selected. +The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector +column named `features`: + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] +~~~ + +Suppose also that we have a potential input attributes for the `userFeatures`, i.e. +`["f1", "f2", "f3"]`, then we can use `setNames("f2", "f3")` to select them. + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] + ["f1", "f2", "f3"] | ["f2", "f3"] +~~~ + +
+
+ +[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input +column name with specified indices or names and an output column name. + +{% highlight scala %} +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} +import org.apache.spark.ml.feature.VectorSlicer +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +val data = Array( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(-2.0, 2.3, 0.0) +) + +val defaultAttr = NumericAttribute.defaultAttr +val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) +val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]) + +val dataRDD = sc.parallelize(data).map(Row.apply) +val dataset = sqlContext.createDataFrame(dataRDD, StructType(attrGroup.toStructField())) + +val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features") + +slicer.setIndices(1).setNames("f3") +// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) + +val output = slicer.transform(dataset) +println(output.select("userFeatures", "features").first()) +{% endhighlight %} +
+ +
+ +[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name +with specified indices or names and an output column name. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.attribute.Attribute; +import org.apache.spark.ml.attribute.AttributeGroup; +import org.apache.spark.ml.attribute.NumericAttribute; +import org.apache.spark.ml.feature.VectorSlicer; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; + +Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr().withName("f1"), + NumericAttribute.defaultAttr().withName("f2"), + NumericAttribute.defaultAttr().withName("f3") +}; +AttributeGroup group = new AttributeGroup("userFeatures", attrs); + +JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), + RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) +)); + +DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + +VectorSlicer vectorSlicer = new VectorSlicer() + .setInputCol("userFeatures").setOutputCol("features"); + +vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); +// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) + +DataFrame output = vectorSlicer.transform(dataset); + +System.out.println(output.select("userFeatures", "features").first()); +{% endhighlight %}
+
+ +## RFormula + +`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. + +**Examples** + +Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `clicked`: + +~~~ +id | country | hour | clicked +---|---------|------|--------- + 7 | "US" | 18 | 1.0 + 8 | "CA" | 12 | 0.0 + 9 | "NZ" | 15 | 0.0 +~~~ + +If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to +predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame: + +~~~ +id | country | hour | clicked | features | label +---|---------|------|---------|------------------|------- + 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0 + 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0 + 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0 +~~~ + +
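Because `RFormula` produces the standard `features` and `label` columns, it can be placed directly in front of an estimator inside a `Pipeline`. The minimal Scala sketch below is illustrative only (the `LogisticRegression` stage is an arbitrary example choice); the language tabs below show the plain fit/transform usage.

{% highlight scala %}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.RFormula

val formula = new RFormula()
  .setFormula("clicked ~ country + hour")
  .setFeaturesCol("features")
  .setLabelCol("label")

// The classifier consumes the columns produced by the formula stage.
val lr = new LogisticRegression().setFeaturesCol("features").setLabelCol("label")
val pipeline = new Pipeline().setStages(Array(formula, lr))
{% endhighlight %}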
+
+ +[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight scala %} +import org.apache.spark.ml.feature.RFormula + +val dataset = sqlContext.createDataFrame(Seq( + (7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0) +)).toDF("id", "country", "hour", "clicked") +val formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label") +val output = formula.fit(dataset).transform(dataset) +output.select("features", "label").show() +{% endhighlight %} +
+ +
+ +[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; + +StructType schema = createStructType(new StructField[] { + createStructField("id", IntegerType, false), + createStructField("country", StringType, false), + createStructField("hour", IntegerType, false), + createStructField("clicked", DoubleType, false) +}); +JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(7, "US", 18, 1.0), + RowFactory.create(8, "CA", 12, 0.0), + RowFactory.create(9, "NZ", 15, 0.0) +)); +DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + +RFormula formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label"); + +DataFrame output = formula.fit(dataset).transform(dataset); +output.select("features", "label").show(); +{% endhighlight %} +
+ +
+ +[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight python %} +from pyspark.ml.feature import RFormula + +dataset = sqlContext.createDataFrame( + [(7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0)], + ["id", "country", "hour", "clicked"]) +formula = RFormula( + formula="clicked ~ country + hour", + featuresCol="features", + labelCol="label") +output = formula.fit(dataset).transform(dataset) +output.select("features", "label").show() +{% endhighlight %} +
+
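Since `RFormula` produces the standard `features` and `label` columns by default, its output plugs directly into the estimators described in the [ML programming guide](ml-guide.html). The following is only a sketch of that combination (not part of the original examples), reusing the `dataset` from the Scala example above together with a hypothetical downstream `LogisticRegression`:

{% highlight scala %}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.RFormula

// RFormula emits "features" and "label" by default, which is exactly what
// LogisticRegression expects, so the two stages compose without extra wiring.
val formula = new RFormula().setFormula("clicked ~ country + hour")
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val pipeline = new Pipeline().setStages(Array(formula, lr))

// `dataset` is the DataFrame with id, country, hour, and clicked columns from above.
val model = pipeline.fit(dataset)
{% endhighlight %}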
diff --git a/docs/ml-guide.md b/docs/ml-guide.md index b6ca50e98db0..78c93a95c780 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -21,75 +21,77 @@ title: Spark ML Programming Guide \]` -Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of -high-level APIs that help users create and tune practical machine learning pipelines. +The `spark.ml` package aims to provide a uniform set of high-level APIs built on top of +[DataFrames](sql-programming-guide.html#dataframes) that help users create and tune practical +machine learning pipelines. +See the [algorithm guides](#algorithm-guides) section below for guides on sub-packages of +`spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more. -*Graduated from Alpha!* The Pipelines API is no longer an alpha component, although many elements of it are still `Experimental` or `DeveloperApi`. - -Note that we will keep supporting and adding features to `spark.mllib` along with the -development of `spark.ml`. -Users should be comfortable using `spark.mllib` features and expect more features coming. -Developers should contribute new algorithms to `spark.mllib` and can optionally contribute -to `spark.ml`. - -Guides for sub-packages of `spark.ml` include: - -* [Feature Extraction, Transformation, and Selection](ml-features.html): Details on transformers supported in the Pipelines API, including a few not in the lower-level `spark.mllib` API -* [Ensembles](ml-ensembles.html): Details on ensemble learning methods in the Pipelines API - - -**Table of Contents** +**Table of contents** * This will become a table of contents (this text will be scraped). {:toc} -# Main Concepts +# Main concepts -Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Spark ML API. +Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple +algorithms into a single pipeline, or workflow. +This section covers the key concepts introduced by the Spark ML API, where the pipeline concept is +mostly inspired by the [scikit-learn](http://scikit-learn.org/) project. -* **[ML Dataset](ml-guide.html#ml-dataset)**: Spark ML uses the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL as a dataset which can hold a variety of data types. -E.g., a dataset could have different columns storing text, feature vectors, true labels, and predictions. +* **[`DataFrame`](ml-guide.html#dataframe)**: Spark ML uses `DataFrame` from Spark SQL as an ML + dataset, which can hold a variety of data types. + E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions. * **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. -E.g., an ML model is a `Transformer` which transforms an RDD with features into an RDD with predictions. +E.g., an ML model is a `Transformer` which transforms `DataFrame` with features into a `DataFrame` with predictions. * **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. -E.g., a learning algorithm is an `Estimator` which trains on a dataset and produces a model. +E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model. 
* **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. -* **[`Param`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. +* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. -## ML Dataset +## DataFrame Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. -Spark ML adopts the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL in order to support a variety of data types under a unified Dataset concept. +Spark ML adopts the `DataFrame` from Spark SQL in order to support a variety of data types. `DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. -In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) types. +In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types. A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. Columns in a `DataFrame` are named. The code examples below use names such as "text," "features," and "label." -## ML Algorithms +## Pipeline components ### Transformers -A [`Transformer`](api/scala/index.html#org.apache.spark.ml.Transformer) is an abstraction which includes feature transformers and learned models. Technically, a `Transformer` implements a method `transform()` which converts one `DataFrame` into another, generally by appending one or more columns. +A `Transformer` is an abstraction that includes feature transformers and learned models. +Technically, a `Transformer` implements a method `transform()`, which converts one `DataFrame` into +another, generally by appending one or more columns. For example: -* A feature transformer might take a dataset, read a column (e.g., text), convert it into a new column (e.g., feature vectors), append the new column to the dataset, and output the updated dataset. -* A learning model might take a dataset, read the column containing feature vectors, predict the label for each feature vector, append the labels as a new column, and output the updated dataset. +* A feature transformer might take a `DataFrame`, read a column (e.g., text), map it into a new + column (e.g., feature vectors), and output a new `DataFrame` with the mapped column appended. +* A learning model might take a `DataFrame`, read the column containing feature vectors, predict the + label for each feature vector, and output a new `DataFrame` with predicted labels appended as a + column. ### Estimators -An [`Estimator`](api/scala/index.html#org.apache.spark.ml.Estimator) abstracts the concept of a learning algorithm or any algorithm which fits or trains on data. Technically, an `Estimator` implements a method `fit()` which accepts a `DataFrame` and produces a `Transformer`. -For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling `fit()` trains a `LogisticRegressionModel`, which is a `Transformer`. 
+An `Estimator` abstracts the concept of a learning algorithm or any algorithm that fits or trains on +data. +Technically, an `Estimator` implements a method `fit()`, which accepts a `DataFrame` and produces a +`Model`, which is a `Transformer`. +For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling +`fit()` trains a `LogisticRegressionModel`, which is a `Model` and hence a `Transformer`. -### Properties of ML Algorithms +### Properties of pipeline components -`Transformer`s and `Estimator`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. +`Transformer.transform()`s and `Estimator.fit()`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. Each instance of a `Transformer` or `Estimator` has a unique ID, which is useful in specifying parameters (discussed below). @@ -102,15 +104,16 @@ E.g., a simple text document processing workflow might include several stages: * Convert each document's words into a numerical feature vector. * Learn a prediction model using the feature vectors and labels. -Spark ML represents such a workflow as a [`Pipeline`](api/scala/index.html#org.apache.spark.ml.Pipeline), -which consists of a sequence of [`PipelineStage`s](api/scala/index.html#org.apache.spark.ml.PipelineStage) (`Transformer`s and `Estimator`s) to be run in a specific order. We will use this simple workflow as a running example in this section. +Spark ML represents such a workflow as a `Pipeline`, which consists of a sequence of +`PipelineStage`s (`Transformer`s and `Estimator`s) to be run in a specific order. +We will use this simple workflow as a running example in this section. -### How It Works +### How it works A `Pipeline` is specified as a sequence of stages, and each stage is either a `Transformer` or an `Estimator`. -These stages are run in order, and the input dataset is modified as it passes through each stage. -For `Transformer` stages, the `transform()` method is called on the dataset. -For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the dataset. +These stages are run in order, and the input `DataFrame` is transformed as it passes through each stage. +For `Transformer` stages, the `transform()` method is called on the `DataFrame`. +For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the `DataFrame`. We illustrate this for the simple text document workflow. The figure below is for the *training time* usage of a `Pipeline`. @@ -126,14 +129,17 @@ We illustrate this for the simple text document workflow. The figure below is f Above, the top row represents a `Pipeline` with three stages. The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red). The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s. -The `Pipeline.fit()` method is called on the original dataset which has raw text documents and labels. -The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words into the dataset. 
-The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the dataset. +The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw text documents and labels. +The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. +The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. -If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` method on the dataset before passing the dataset to the next stage. +If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` +method on the `DataFrame` before passing the `DataFrame` to the next stage. A `Pipeline` is an `Estimator`. -Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel` which is a `Transformer`. This `PipelineModel` is used at *test time*; the figure below illustrates this usage. +Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel`, which is a +`Transformer`. +This `PipelineModel` is used at *test time*; the figure below illustrates this usage.

In the figure above, the `PipelineModel` has the same number of stages as the original `Pipeline`, but all `Estimator`s in the original `Pipeline` have become `Transformer`s. -When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed through the `Pipeline` in order. +When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed +through the fitted pipeline in order. Each stage's `transform()` method updates the dataset and passes it to the next stage. `Pipeline`s and `PipelineModel`s help to ensure that training and test data go through identical feature processing steps. @@ -154,41 +161,48 @@ Each stage's `transform()` method updates the dataset and passes it to the next *DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array. The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage. It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG). This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters). If the `Pipeline` forms a DAG, then the stages must be specified in topological order. -*Runtime checking*: Since `Pipeline`s can operate on datasets with varied types, they cannot use compile-time type checking. `Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. This type checking is done using the dataset *schema*, a description of the data types of columns in the `DataFrame`. +*Runtime checking*: Since `Pipeline`s can operate on `DataFrame`s with varied types, they cannot use +compile-time type checking. +`Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. +This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`. ## Parameters Spark ML `Estimator`s and `Transformer`s use a uniform API for specifying parameters. -A [`Param`](api/scala/index.html#org.apache.spark.ml.param.Param) is a named parameter with self-contained documentation. -A [`ParamMap`](api/scala/index.html#org.apache.spark.ml.param.ParamMap) is a set of (parameter, value) pairs. +A `Param` is a named parameter with self-contained documentation. +A `ParamMap` is a set of (parameter, value) pairs. There are two main ways to pass parameters to an algorithm: -1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. This API resembles the API used in MLlib. +1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could + call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. + This API resembles the API used in `spark.mllib` package. 2. Pass a `ParamMap` to `fit()` or `transform()`. Any parameters in the `ParamMap` will override parameters previously specified via setter methods. Parameters belong to specific instances of `Estimator`s and `Transformer`s. For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. 
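To make the two styles concrete, here is a minimal Scala sketch, assuming a `training` `DataFrame` with "label" and "features" columns as in the code examples later in this guide:

{% highlight scala %}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

val lr = new LogisticRegression()

// 1. Set parameters on the instance itself via setter methods.
lr.setMaxIter(10).setRegParam(0.01)
val model1 = lr.fit(training)

// 2. Or pass a ParamMap to fit(); these values override the setters above.
val paramMap = ParamMap(lr.maxIter -> 20, lr.regParam -> 0.1)
val model2 = lr.fit(training, paramMap)
{% endhighlight %}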
-# Algorithm Guides - -There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. +# Algorithm guides -**Pipelines API Algorithm Guides** +There are now several algorithms in the Pipelines API which are not in the `spark.mllib` API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. -* [Feature Extraction, Transformation, and Selection](ml-features.html) +* [Feature extraction, transformation, and selection](ml-features.html) +* [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) - -**Algorithms in `spark.ml`** - * [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) -# Code Examples +# Code examples This section gives code examples illustrating the functionality discussed above. -There is not yet documentation for specific algorithms in Spark ML. For more info, please refer to the [API Documentation](api/scala/index.html#org.apache.spark.ml.package). Spark ML algorithms are currently wrappers for MLlib algorithms, and the [MLlib programming guide](mllib-guide.html) has details on specific algorithms. +For more info, please refer to the API documentation +([Scala](api/scala/index.html#org.apache.spark.ml.package), +[Java](api/java/org/apache/spark/ml/package-summary.html), +and [Python](api/python/pyspark.ml.html)). +Some Spark ML algorithms are wrappers for `spark.mllib` algorithms, and the +[MLlib programming guide](mllib-guide.html) has details on specific algorithms. ## Example: Estimator, Transformer, and Param @@ -198,26 +212,18 @@ This example covers the concepts of `Estimator`, `Transformer`, and `Param`.

{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.Row -val conf = new SparkConf().setAppName("SimpleParamsExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ - -// Prepare training data. -// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes -// into DataFrames, where it uses the case class metadata to infer the schema. -val training = sc.parallelize(Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) +// Prepare training data from a list of (label, features) tuples. +val training = sqlContext.createDataFrame(Seq( + (1.0, Vectors.dense(0.0, 1.1, 0.1)), + (0.0, Vectors.dense(2.0, 1.0, -1.0)), + (0.0, Vectors.dense(2.0, 1.3, 1.0)), + (1.0, Vectors.dense(0.0, 1.2, -0.5)) +)).toDF("label", "features") // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() @@ -229,7 +235,7 @@ lr.setMaxIter(10) .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. -val model1 = lr.fit(training.toDF) +val model1 = lr.fit(training) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -239,8 +245,8 @@ println("Model 1 was fit using parameters: " + model1.parent.extractParamMap) // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) -paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. -paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. + .put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. + .put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. // One can also combine ParamMaps. val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name @@ -248,58 +254,52 @@ val paramMapCombined = paramMap ++ paramMap2 // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. -val model2 = lr.fit(training.toDF, paramMapCombined) +val model2 = lr.fit(training, paramMapCombined) println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) // Prepare test data. -val test = sc.parallelize(Seq( - LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) +val test = sqlContext.createDataFrame(Seq( + (1.0, Vectors.dense(-1.0, 1.5, 1.3)), + (0.0, Vectors.dense(3.0, 2.0, -0.1)), + (1.0, Vectors.dense(0.0, 2.2, -1.5)) +)).toDF("label", "features") // Make predictions on test data using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. 
// Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. -model2.transform(test.toDF) +model2.transform(test) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => println(s"($features, $label) -> prob=$prob, prediction=$prediction") } -sc.stop() {% endhighlight %}
{% highlight java %} +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; + import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; -SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); - // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans // into DataFrames, where it uses the bean metadata to infer the schema. -List localTraining = Lists.newArrayList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); + new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)) +), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -319,14 +319,14 @@ LogisticRegressionModel model1 = lr.fit(training); System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); // We may alternatively specify parameters using a ParamMap. -ParamMap paramMap = new ParamMap(); -paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. -paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. -paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. +ParamMap paramMap = new ParamMap() + .put(lr.maxIter().w(20)) // Specify 1 Param. + .put(lr.maxIter(), 30) // This overwrites the original maxIter. + .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. // One can also combine ParamMaps. -ParamMap paramMap2 = new ParamMap(); -paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name +ParamMap paramMap2 = new ParamMap() + .put(lr.probabilityCol().w("myProbability")); // Change output column name ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); // Now learn a new model using the paramMapCombined parameters. @@ -335,11 +335,11 @@ LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); // Prepare test documents. -List localTest = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); +DataFrame test = sqlContext.createDataFrame(Arrays.asList( + new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), + new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), + new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)) +), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. 
// LogisticRegression.transform will only use the 'features' column. @@ -351,7 +351,68 @@ for (Row r: results.select("features", "label", "myProbability", "prediction").c + ", prediction=" + r.get(3)); } -jsc.stop(); +{% endhighlight %} +
+ +
+{% highlight python %} +from pyspark.mllib.linalg import Vectors +from pyspark.ml.classification import LogisticRegression +from pyspark.ml.param import Param, Params + +# Prepare training data from a list of (label, features) tuples. +training = sqlContext.createDataFrame([ + (1.0, Vectors.dense([0.0, 1.1, 0.1])), + (0.0, Vectors.dense([2.0, 1.0, -1.0])), + (0.0, Vectors.dense([2.0, 1.3, 1.0])), + (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"]) + +# Create a LogisticRegression instance. This instance is an Estimator. +lr = LogisticRegression(maxIter=10, regParam=0.01) +# Print out the parameters, documentation, and any default values. +print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" + +# Learn a LogisticRegression model. This uses the parameters stored in lr. +model1 = lr.fit(training) + +# Since model1 is a Model (i.e., a transformer produced by an Estimator), +# we can view the parameters it used during fit(). +# This prints the parameter (name: value) pairs, where names are unique IDs for this +# LogisticRegression instance. +print "Model 1 was fit using parameters: " +print model1.extractParamMap() + +# We may alternatively specify parameters using a Python dictionary as a paramMap +paramMap = {lr.maxIter: 20} +paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter. +paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params. + +# You can combine paramMaps, which are python dictionaries. +paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name +paramMapCombined = paramMap.copy() +paramMapCombined.update(paramMap2) + +# Now learn a new model using the paramMapCombined parameters. +# paramMapCombined overrides all parameters set earlier via lr.set* methods. +model2 = lr.fit(training, paramMapCombined) +print "Model 2 was fit using parameters: " +print model2.extractParamMap() + +# Prepare test data +test = sqlContext.createDataFrame([ + (1.0, Vectors.dense([-1.0, 1.5, 1.3])), + (0.0, Vectors.dense([3.0, 2.0, -0.1])), + (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"]) + +# Make predictions on test data using the Transformer.transform() method. +# LogisticRegression.transform will only use the 'features' column. +# Note that model2.transform() outputs a "myProbability" column instead of the usual +# 'probability' column since we renamed the lr.probabilityCol parameter previously. +prediction = model2.transform(test) +selected = prediction.select("features", "label", "myProbability", "prediction") +for row in selected.collect(): + print row + {% endhighlight %}
@@ -365,30 +426,19 @@ This example follows the simple text document `Pipeline` illustrated in the figu
{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.{Row, SQLContext} - -// Labeled and unlabeled instance types. -// Spark SQL can infer schema from case classes. -case class LabeledDocument(id: Long, text: String, label: Double) -case class Document(id: Long, text: String) - -// Set up contexts. Import implicit conversions to DataFrame from sqlContext. -val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ +import org.apache.spark.sql.Row -// Prepare training documents, which are labeled. -val training = sc.parallelize(Seq( - LabeledDocument(0L, "a b c d e spark", 1.0), - LabeledDocument(1L, "b d", 0.0), - LabeledDocument(2L, "spark f g h", 1.0), - LabeledDocument(3L, "hadoop mapreduce", 0.0))) +// Prepare training documents from a list of (id, text, label) tuples. +val training = sqlContext.createDataFrame(Seq( + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0) +)).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() @@ -405,14 +455,15 @@ val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. -val model = pipeline.fit(training.toDF) +val model = pipeline.fit(training) -// Prepare test documents, which are unlabeled. -val test = sc.parallelize(Seq( - Document(4L, "spark i j k"), - Document(5L, "l m n"), - Document(6L, "mapreduce spark"), - Document(7L, "apache hadoop"))) +// Prepare test documents, which are unlabeled (id, text) tuples. +val test = sqlContext.createDataFrame(Seq( + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop") +)).toDF("id", "text") // Make predictions on test documents. model.transform(test.toDF) @@ -422,16 +473,14 @@ model.transform(test.toDF) println(s"($id, $text) --> prob=$prob, prediction=$prediction") } -sc.stop() {% endhighlight %}
{% highlight java %} +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; + import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -440,7 +489,6 @@ import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; // Labeled and unlabeled instance types. // Spark SQL can infer schema from Java Beans. @@ -472,18 +520,13 @@ public class LabeledDocument extends Document implements Serializable { public void setLabel(double label) { this.label = label; } } -// Set up contexts. -SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); - // Prepare training documents, which are labeled. -List localTraining = Lists.newArrayList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledDocument(0L, "a b c d e spark", 1.0), new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0)); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + new LabeledDocument(3L, "hadoop mapreduce", 0.0) +), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -503,12 +546,12 @@ Pipeline pipeline = new Pipeline() PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. -List localTest = Lists.newArrayList( +DataFrame test = sqlContext.createDataFrame(Arrays.asList( new Document(4L, "spark i j k"), new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), - new Document(7L, "apache hadoop")); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + new Document(7L, "apache hadoop") +), Document.class); // Make predictions on test documents. DataFrame predictions = model.transform(test); @@ -517,28 +560,23 @@ for (Row r: predictions.select("id", "text", "probability", "prediction").collec + ", prediction=" + r.get(3)); } -jsc.stop(); {% endhighlight %}
{% highlight python %} -from pyspark import SparkContext from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import Row, SQLContext +from pyspark.sql import Row -sc = SparkContext(appName="SimpleTextClassificationPipeline") -sqlContext = SQLContext(sc) - -# Prepare training documents, which are labeled. +# Prepare training documents from a list of (id, text, label) tuples. LabeledDocument = Row("id", "text", "label") -training = sc.parallelize([(0L, "a b c d e spark", 1.0), - (1L, "b d", 0.0), - (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0)]) \ - .map(lambda x: LabeledDocument(*x)).toDF() +training = sqlContext.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") @@ -549,13 +587,12 @@ pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) -# Prepare test documents, which are unlabeled. -Document = Row("id", "text") -test = sc.parallelize([(4L, "spark i j k"), - (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")]) \ - .map(lambda x: Document(*x)).toDF() +# Prepare test documents, which are unlabeled (id, text) tuples. +test = sqlContext.createDataFrame([ + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop")], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) @@ -563,13 +600,12 @@ selected = prediction.select("id", "text", "prediction") for row in selected.collect(): print(row) -sc.stop() {% endhighlight %}
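As a quick check of the earlier point that a fitted `PipelineModel` contains only `Transformer`s, its stages can be inspected directly. This is only a sketch, assuming the `model` fitted in the Scala pipeline example above:

{% highlight scala %}
import org.apache.spark.ml.classification.LogisticRegressionModel

// The LogisticRegression Estimator in the original Pipeline has become a
// LogisticRegressionModel (a Transformer) in the fitted PipelineModel.
println(model.stages.map(_.getClass.getSimpleName).mkString(", "))
val lrModel = model.stages.last.asInstanceOf[LogisticRegressionModel]
println(s"Fitted model weights: ${lrModel.weights}")
{% endhighlight %}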
-## Example: Model Selection via Cross-Validation +## Example: model selection via cross-validation An important task in ML is *model selection*, or using data to find the best model or parameters for a given task. This is also called *tuning*. `Pipeline`s facilitate model selection by making it easy to tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately. @@ -577,6 +613,13 @@ An important task in ML is *model selection*, or using data to find the best mod Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.Evaluator). `CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets; e.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. `CrossValidator` iterates through the set of `ParamMap`s. For each `ParamMap`, it trains the given `Estimator` and evaluates it using the given `Evaluator`. + +The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator) +for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) +for binary data, or a [`MulticlassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator) +for multiclass problems. The default metric used to choose the best `ParamMap` can be overridden by the `setMetricName` +method in each of these evaluators. + The `ParamMap` which produces the best evaluation metric (averaged over the `$k$` folds) is selected as the best model. `CrossValidator` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. @@ -593,39 +636,29 @@ However, it is also a well-established method for choosing parameters which is m
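To illustrate the paragraph above, here is a minimal Scala sketch of choosing a non-default metric for each kind of evaluator (the metric names shown are assumed from the `spark.ml` evaluation APIs of this release):

{% highlight scala %}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator}

// Binary classification: the default metric is areaUnderROC.
val binaryEval = new BinaryClassificationEvaluator().setMetricName("areaUnderPR")
// Regression: the default metric is rmse.
val regressionEval = new RegressionEvaluator().setMetricName("r2")
// Multiclass classification: the default metric is f1.
val multiclassEval = new MulticlassClassificationEvaluator().setMetricName("precision")
{% endhighlight %}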
{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.{Row, SQLContext} - -// Labeled and unlabeled instance types. -// Spark SQL can infer schema from case classes. -case class LabeledDocument(id: Long, text: String, label: Double) -case class Document(id: Long, text: String) - -val conf = new SparkConf().setAppName("CrossValidatorExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ - -// Prepare training documents, which are labeled. -val training = sc.parallelize(Seq( - LabeledDocument(0L, "a b c d e spark", 1.0), - LabeledDocument(1L, "b d", 0.0), - LabeledDocument(2L, "spark f g h", 1.0), - LabeledDocument(3L, "hadoop mapreduce", 0.0), - LabeledDocument(4L, "b spark who", 1.0), - LabeledDocument(5L, "g d a y", 0.0), - LabeledDocument(6L, "spark fly", 1.0), - LabeledDocument(7L, "was mapreduce", 0.0), - LabeledDocument(8L, "e spark program", 1.0), - LabeledDocument(9L, "a e c l", 0.0), - LabeledDocument(10L, "spark compile", 1.0), - LabeledDocument(11L, "hadoop software", 0.0))) +import org.apache.spark.sql.Row + +// Prepare training data from a list of (id, text, label) tuples. +val training = sqlContext.createDataFrame(Seq( + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 0.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 0.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 0.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 0.0) +)).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() @@ -639,12 +672,6 @@ val lr = new LogisticRegression() val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) -// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. -// This will allow us to jointly choose parameters for all Pipeline stages. -// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. -val crossval = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator) // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. @@ -652,37 +679,45 @@ val paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures, Array(10, 100, 1000)) .addGrid(lr.regParam, Array(0.1, 0.01)) .build() -crossval.setEstimatorParamMaps(paramGrid) -crossval.setNumFolds(2) // Use 3+ in practice + +// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. +// This will allow us to jointly choose parameters for all Pipeline stages. +// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric +// is areaUnderROC. 
+val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(2) // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. -val cvModel = crossval.fit(training.toDF) +val cvModel = cv.fit(training) -// Prepare test documents, which are unlabeled. -val test = sc.parallelize(Seq( - Document(4L, "spark i j k"), - Document(5L, "l m n"), - Document(6L, "mapreduce spark"), - Document(7L, "apache hadoop"))) +// Prepare test documents, which are unlabeled (id, text) tuples. +val test = sqlContext.createDataFrame(Seq( + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop") +)).toDF("id", "text") // Make predictions on test documents. cvModel uses the best model found (lrModel). -cvModel.transform(test.toDF) +cvModel.transform(test) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => - println(s"($id, $text) --> prob=$prob, prediction=$prediction") -} + println(s"($id, $text) --> prob=$prob, prediction=$prediction") + } -sc.stop() {% endhighlight %}
{% highlight java %} +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; + import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; @@ -695,7 +730,6 @@ import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; // Labeled and unlabeled instance types. // Spark SQL can infer schema from Java Beans. @@ -727,12 +761,9 @@ public class LabeledDocument extends Document implements Serializable { public void setLabel(double label) { this.label = label; } } -SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); // Prepare training documents, which are labeled. -List localTraining = Lists.newArrayList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledDocument(0L, "a b c d e spark", 1.0), new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), @@ -744,8 +775,8 @@ List localTraining = Lists.newArrayList( new LabeledDocument(8L, "e spark program", 1.0), new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), - new LabeledDocument(11L, "hadoop software", 0.0)); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + new LabeledDocument(11L, "hadoop software", 0.0) +), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -761,12 +792,6 @@ LogisticRegression lr = new LogisticRegression() Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); -// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. -// This will allow us to jointly choose parameters for all Pipeline stages. -// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. -CrossValidator crossval = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator()); // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. @@ -774,19 +799,28 @@ ParamMap[] paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures(), new int[]{10, 100, 1000}) .addGrid(lr.regParam(), new double[]{0.1, 0.01}) .build(); -crossval.setEstimatorParamMaps(paramGrid); -crossval.setNumFolds(2); // Use 3+ in practice + +// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. +// This will allow us to jointly choose parameters for all Pipeline stages. +// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric +// is areaUnderROC. +CrossValidator cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator()) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(2); // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. 
-CrossValidatorModel cvModel = crossval.fit(training); +CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. -List localTest = Lists.newArrayList( +DataFrame test = sqlContext.createDataFrame(Arrays.asList( new Document(4L, "spark i j k"), new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), - new Document(7L, "apache hadoop")); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + new Document(7L, "apache hadoop") +), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). DataFrame predictions = cvModel.transform(test); @@ -795,40 +829,121 @@ for (Row r: predictions.select("id", "text", "probability", "prediction").collec + ", prediction=" + r.get(3)); } -jsc.stop(); {% endhighlight %}
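Once cross-validation has run, it can be helpful to inspect which parameters were selected. A minimal sketch, assuming the `cvModel` fitted in the Scala example above (the cast to `PipelineModel` reflects that the wrapped estimator was a `Pipeline`):

{% highlight scala %}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel

// cvModel.bestModel is the PipelineModel that was refit on the full training data
// with the best-performing ParamMap.
val bestPipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
val bestLrModel = bestPipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
println("Best LogisticRegression parameters:\n" + bestLrModel.parent.extractParamMap)
{% endhighlight %}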
-# Dependencies +## Example: model selection via train validation split +In addition to `CrossValidator`, Spark also offers `TrainValidationSplit` for hyper-parameter tuning. +`TrainValidationSplit` only evaluates each combination of parameters once, as opposed to k times in +the case of `CrossValidator`. It is therefore less expensive, +but will not produce as reliable results when the training dataset is not sufficiently large. + +`TrainValidationSplit` takes an `Estimator`, a set of `ParamMap`s provided in the `estimatorParamMaps` parameter, +and an `Evaluator`. +It begins by splitting the dataset into two parts using the `trainRatio` parameter, +which are used as separate training and validation datasets. For example, with `$trainRatio=0.75$` (the default), +`TrainValidationSplit` will generate a training and validation dataset pair where 75% of the data is used for training and 25% for validation. +Like `CrossValidator`, `TrainValidationSplit` iterates through the set of `ParamMap`s. +For each combination of parameters, it trains the given `Estimator` and evaluates it using the given `Evaluator`. +The `ParamMap` which produces the best evaluation metric is selected as the best option. +`TrainValidationSplit` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. + +
-Spark ML currently depends on MLlib and has the same dependencies. -Please see the [MLlib Dependencies guide](mllib-guide.html#dependencies) for more info. +
+{% highlight scala %} +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} +import org.apache.spark.mllib.util.MLUtils -Spark ML also depends upon Spark SQL, but the relevant parts of Spark SQL do not bring additional dependencies. +// Prepare training and test data. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) -# Migration Guide +val lr = new LinearRegression() -## From 1.3 to 1.4 +// We use a ParamGridBuilder to construct a grid of parameters to search over. +// TrainValidationSplit will try all combinations of values and determine best model using +// the evaluator. +val paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam, Array(0.1, 0.01)) + .addGrid(lr.fitIntercept) + .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) + .build() -Several major API changes occurred, including: -* `Param` and other APIs for specifying parameters -* `uid` unique IDs for Pipeline components -* Reorganization of certain classes -Since the `spark.ml` API was an Alpha Component in Spark 1.3, we do not list all changes here. +// In this case the estimator is simply the linear regression. +// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +val trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator) + .setEstimatorParamMaps(paramGrid) + // 80% of the data will be used for training and the remaining 20% for validation. + .setTrainRatio(0.8) -However, now that `spark.ml` is no longer an Alpha Component, we will provide details on any API changes for future releases. +// Run train validation split, and choose the best set of parameters. +val model = trainValidationSplit.fit(training) -## From 1.2 to 1.3 +// Make predictions on test data. model is the model with combination of parameters +// that performed best. +model.transform(test) + .select("features", "label", "prediction") + .show() -The main API changes are from Spark SQL. We list the most important changes here: +{% endhighlight %} +
-* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. -* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. -* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. +
+{% highlight java %} +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.tuning.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +DataFrame data = sqlContext.createDataFrame( + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), + LabeledPoint.class); + +// Prepare training and test data. +DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); +DataFrame training = splits[0]; +DataFrame test = splits[1]; + +LinearRegression lr = new LinearRegression(); + +// We use a ParamGridBuilder to construct a grid of parameters to search over. +// TrainValidationSplit will try all combinations of values and determine best model using +// the evaluator. +ParamMap[] paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam(), new double[] {0.1, 0.01}) + .addGrid(lr.fitIntercept()) + .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) + .build(); + +// In this case the estimator is simply the linear regression. +// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +TrainValidationSplit trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator()) + .setEstimatorParamMaps(paramGrid) + .setTrainRatio(0.8); // 80% for training and the remaining 20% for validation + +// Run train validation split, and choose the best set of parameters. +TrainValidationSplitModel model = trainValidationSplit.fit(training); + +// Make predictions on test data. model is the model with combination of parameters +// that performed best. +model.transform(test) + .select("features", "label", "prediction") + .show(); -Other changes were in `LogisticRegression`: +{% endhighlight %} +
-* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). -* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. +
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 1ac83d94c9e8..cdd9d4999fa1 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -23,20 +23,41 @@ displayTitle: ML - Linear Methods \]` -In MLlib, we implement popular linear methods such as logistic regression and linear least squares with L1 or L2 regularization. Refer to [the linear methods in mllib](mllib-linear-methods.html) for details. In `spark.ml`, we also include Pipelines API for [Elastic net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid of L1 and L2 regularization proposed in [this paper](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). Mathematically it is defined as a linear combination of the L1-norm and the L2-norm: +In MLlib, we implement popular linear methods such as logistic +regression and linear least squares with $L_1$ or $L_2$ regularization. +Refer to [the linear methods in mllib](mllib-linear-methods.html) for +details. In `spark.ml`, we also include Pipelines API for [Elastic +net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid +of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization +and variable selection via the elastic +net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). +Mathematically, it is defined as a convex combination of the $L_1$ and +the $L_2$ regularization terms: `\[ -\alpha \|\wv\|_1 + (1-\alpha) \frac{1}{2}\|\wv\|_2^2, \alpha \in [0, 1]. +\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0 \]` -By setting $\alpha$ properly, it contains both L1 and L2 regularization as special cases. For example, if a [linear regression](https://en.wikipedia.org/wiki/Linear_regression) model is trained with the elastic net parameter $\alpha$ set to $1$, it is equivalent to a [Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. On the other hand, if $\alpha$ is set to $0$, the trained model reduces to a [ridge regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. We implement Pipelines API for both linear regression and logistic regression with elastic net regularization. - -**Examples** +By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ +regularization as special cases. For example, if a [linear +regression](https://en.wikipedia.org/wiki/Linear_regression) model is +trained with the elastic net parameter $\alpha$ set to $1$, it is +equivalent to a +[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. +On the other hand, if $\alpha$ is set to $0$, the trained model reduces +to a [ridge +regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. +We implement Pipelines API for both linear regression and logistic +regression with elastic net regularization. + +## Example: Logistic Regression + +The following example shows how to train a logistic regression model +with elastic net regularization. `elasticNetParam` corresponds to +$\alpha$ and `regParam` corresponds to $\lambda$.
- {% highlight scala %} - import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.mllib.util.MLUtils @@ -53,15 +74,11 @@ val lrModel = lr.fit(training) // Print the weights and intercept for logistic regression println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") - {% endhighlight %} -
- {% highlight java %} - import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.mllib.regression.LabeledPoint; @@ -78,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample { SparkContext sc = new SparkContext(conf); SQLContext sql = new SQLContext(sc); - String path = "sample_libsvm_data.txt"; + String path = "data/mllib/sample_libsvm_data.txt"; // Load training data DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); @@ -86,7 +103,7 @@ public class LogisticRegressionWithElasticNetExample { LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.3) - .setElasticNetParam(0.8) + .setElasticNetParam(0.8); // Fit the model LogisticRegressionModel lrModel = lr.fit(training); @@ -99,9 +116,7 @@ public class LogisticRegressionWithElasticNetExample {
- {% highlight python %} - from pyspark.ml.classification import LogisticRegression from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import MLUtils @@ -118,12 +133,228 @@ lrModel = lr.fit(training) print("Weights: " + str(lrModel.weights)) print("Intercept: " + str(lrModel.intercept)) {% endhighlight %} +
+The `spark.ml` implementation of logistic regression also supports +extracting a summary of the model over the training set. Note that the +predictions and metrics, which are stored as a `DataFrame` in +`BinaryLogisticRegressionSummary`, are annotated `@transient` and hence +are only available on the driver. + +
+ +
+ +[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary) +provides a summary for a +[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel). +Currently, only binary classification is supported and the +summary must be explicitly cast to +[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary). +This will likely change when multiclass classification is supported. + +Continuing the earlier example: + +{% highlight scala %} +import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary + +// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example +val trainingSummary = lrModel.summary + +// Obtain the objective per iteration. +val objectiveHistory = trainingSummary.objectiveHistory +objectiveHistory.foreach(loss => println(loss)) + +// Obtain the metrics useful to judge performance on test data. +// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a +// binary classification problem. +val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary] + +// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. +val roc = binarySummary.roc +roc.show() +println(binarySummary.areaUnderROC) + +// Set the model threshold to maximize F-Measure +val fMeasure = binarySummary.fMeasureByThreshold +val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) +val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure). + select("threshold").head().getDouble(0) +lrModel.setThreshold(bestThreshold) +{% endhighlight %}
-### Optimization +
+[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html) +provides a summary for a +[`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html). +Currently, only binary classification is supported and the +summary must be explicitly cast to +[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html). +This will likely change when multiclass classification is supported. + +Continuing the earlier example: + +{% highlight java %} +import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; +import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; +import org.apache.spark.sql.functions; + +// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example +LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); + +// Obtain the loss per iteration. +double[] objectiveHistory = trainingSummary.objectiveHistory(); +for (double lossPerIteration : objectiveHistory) { + System.out.println(lossPerIteration); +} + +// Obtain the metrics useful to judge performance on test data. +// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a +// binary classification problem. +BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary; + +// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. +DataFrame roc = binarySummary.roc(); +roc.show(); +roc.select("FPR").show(); +System.out.println(binarySummary.areaUnderROC()); + +// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with +// this selected threshold. +DataFrame fMeasure = binarySummary.fMeasureByThreshold(); +double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0); +double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)). + select("threshold").head().getDouble(0); +lrModel.setThreshold(bestThreshold); +{% endhighlight %} +
+ + +
+Logistic regression model summary is not yet supported in Python. +
+ +
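As a usage note, the fitted model can also be applied to a `DataFrame` with `transform`. The following is a minimal Scala sketch, assuming the `training` data and `lrModel` from the example above and the default output column names (`probability`, `prediction`).

{% highlight scala %}
// Minimal sketch, assuming `lrModel` and `training` from the Scala example above.
// Apply the fitted logistic regression model and inspect the default output columns.
val predictions = lrModel.transform(training)

// "probability" holds the per-class probability vector; "prediction" holds the predicted label.
predictions.select("label", "probability", "prediction").show(5)
{% endhighlight %}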
+ +## Example: Linear Regression + +The interface for working with linear regression models and model +summaries is similar to the logistic regression case. The following +example demonstrates training an elastic net regularized linear +regression model and extracting model summary statistics. + +
+ +
+{% highlight scala %} +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.mllib.util.MLUtils + +// Load training data +val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +val lr = new LinearRegression() + .setMaxIter(10) + .setRegParam(0.3) + .setElasticNetParam(0.8) + +// Fit the model +val lrModel = lr.fit(training) + +// Print the weights and intercept for linear regression +println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") + +// Summarize the model over the training set and print out some metrics +val trainingSummary = lrModel.summary +println(s"numIterations: ${trainingSummary.totalIterations}") +println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") +trainingSummary.residuals.show() +println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") +println(s"r2: ${trainingSummary.r2}") +{% endhighlight %} +
+ +
+{% highlight java %} +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.regression.LinearRegressionModel; +import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; + +public class LinearRegressionWithElasticNetExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf() + .setAppName("Linear Regression with Elastic Net Example"); + + SparkContext sc = new SparkContext(conf); + SQLContext sql = new SQLContext(sc); + String path = "data/mllib/sample_libsvm_data.txt"; + + // Load training data + DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); + + LinearRegression lr = new LinearRegression() + .setMaxIter(10) + .setRegParam(0.3) + .setElasticNetParam(0.8); + + // Fit the model + LinearRegressionModel lrModel = lr.fit(training); + + // Print the weights and intercept for linear regression + System.out.println("Weights: " + lrModel.weights() + " Intercept: " + lrModel.intercept()); + + // Summarize the model over the training set and print out some metrics + LinearRegressionTrainingSummary trainingSummary = lrModel.summary(); + System.out.println("numIterations: " + trainingSummary.totalIterations()); + System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory())); + trainingSummary.residuals().show(); + System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError()); + System.out.println("r2: " + trainingSummary.r2()); + } +} +{% endhighlight %} +
+ +
+ +{% highlight python %} +from pyspark.ml.regression import LinearRegression +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.util import MLUtils + +# Load training data +training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) + +# Fit the model +lrModel = lr.fit(training) + +# Print the weights and intercept for linear regression +print("Weights: " + str(lrModel.weights)) +print("Intercept: " + str(lrModel.intercept)) + +# Linear regression model summary is not yet supported in Python. +{% endhighlight %} +
+ +
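The special cases discussed at the beginning of this page can be written out explicitly. The following is a minimal Scala sketch, assuming the same `training` DataFrame as in the linear regression example above.

{% highlight scala %}
// Minimal sketch of the elastic net special cases, assuming `training` from the example above.
// elasticNetParam = 1.0 keeps only the L1 term (Lasso-like penalty);
// elasticNetParam = 0.0 keeps only the L2 term (ridge-like penalty).
val lasso = new LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(1.0)
val ridge = new LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.0)

val lassoModel = lasso.fit(training)
val ridgeModel = ridge.fit(training)
{% endhighlight %}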
+ +# Optimization + +The optimization algorithm underlying the implementation is called +[Orthant-Wise Limited-memory +QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 +regularization and elastic net. -The optimization algorithm underlies the implementation is called [Orthant-Wise Limited-memory QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 regularization and elastic net. diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index bb875ae2ae6c..3fb35d3c50b0 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: - -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function. +`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. 
Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. - -*Note*: LDA is a new feature with some missing functionality. In particular, it does not yet -support prediction on new documents, and it does not have a Python API. These will be added in the future. +* `optimizer`: Optimizer to use for learning the LDA model, either +`EMLDAOptimizer` or `OnlineLDAOptimizer` +* `docConcentration`: Dirichlet parameter for prior over documents' +distributions over topics. Larger values encourage smoother inferred +distributions. +* `topicConcentration`: Dirichlet parameter for prior over topics' +distributions over terms (words). Larger values encourage smoother +inferred distributions. +* `maxIterations`: Limit on the number of iterations. +* `checkpointInterval`: If using checkpointing (set in the Spark +configuration), this parameter specifies the frequency with which +checkpoints will be created. If `maxIterations` is large, using +checkpointing can help reduce shuffle file sizes on disk and help with +failure recovery. + + +All of MLlib's LDA models support: + +* `describeTopics`: Returns topics as arrays of most important terms and +term weights +* `topicsMatrix`: Returns a `vocabSize` by `k` matrix where each column +is a topic + +*Note*: LDA is still an experimental feature under active development. +As a result, certain features are only available in one of the two +optimizers / models generated by the optimizer. Currently, a distributed +model can be converted into a local model, but not vice-versa. + +The following discussion will describe each optimizer/model pair +separately. + +**Expectation Maximization** + +Implemented in +[`EMLDAOptimizer`](api/scala/index.html#org.apache.spark.mllib.clustering.EMLDAOptimizer) +and +[`DistributedLDAModel`](api/scala/index.html#org.apache.spark.mllib.clustering.DistributedLDAModel). + +For the parameters provided to `LDA`: + +* `docConcentration`: Only symmetric priors are supported, so all values +in the provided `k`-dimensional vector must be identical. All values +must also be $> 1.0$. Providing `Vector(-1)` results in default behavior +(uniform `k` dimensional vector with value $(50 / k) + 1$ +* `topicConcentration`: Only symmetric priors supported. Values must be +$> 1.0$. Providing `-1` results in defaulting to a value of $0.1 + 1$. +* `maxIterations`: The maximum number of EM iterations. + +`EMLDAOptimizer` produces a `DistributedLDAModel`, which stores not only +the inferred topics but also the full training corpus and topic +distributions for each document in the training corpus. A +`DistributedLDAModel` supports: + + * `topTopicsPerDocument`: The top topics and their weights for + each document in the training corpus + * `topDocumentsPerTopic`: The top documents for each topic and + the corresponding weight of the topic in the documents. 
+ * `logPrior`: log probability of the estimated topics and + document-topic distributions given the hyperparameters + `docConcentration` and `topicConcentration` + * `logLikelihood`: log likelihood of the training corpus, given the + inferred topics and document-topic distributions + +**Online Variational Bayes** + +Implemented in +[`OnlineLDAOptimizer`](api/scala/org/apache/spark/mllib/clustering/OnlineLDAOptimizer.html) +and +[`LocalLDAModel`](api/scala/org/apache/spark/mllib/clustering/LocalLDAModel.html). + +For the parameters provided to `LDA`: + +* `docConcentration`: Asymmetric priors can be used by passing in a +vector with values equal to the Dirichlet parameter in each of the `k` +dimensions. Values should be $>= 0$. Providing `Vector(-1)` results in +default behavior (uniform `k` dimensional vector with value $(1.0 / k)$) +* `topicConcentration`: Only symmetric priors supported. Values must be +$>= 0$. Providing `-1` results in defaulting to a value of $(1.0 / k)$. +* `maxIterations`: Maximum number of minibatches to submit. + +In addition, `OnlineLDAOptimizer` accepts the following parameters: + +* `miniBatchFraction`: Fraction of corpus sampled and used at each +iteration +* `optimizeDocConcentration`: If set to true, performs maximum-likelihood +estimation of the hyperparameter `docConcentration` (aka `alpha`) +after each minibatch and sets the optimized `docConcentration` in the +returned `LocalLDAModel` +* `tau0` and `kappa`: Used for learning-rate decay, which is computed by +$(\tau_0 + iter)^{-\kappa}$ where $iter$ is the current number of iterations. + +`OnlineLDAOptimizer` produces a `LocalLDAModel`, which only stores the +inferred topics. A `LocalLDAModel` supports: + +* `logLikelihood(documents)`: Calculates a lower bound on the provided +`documents` given the inferred topics. +* `logPerplexity(documents)`: Calculates an upper bound on the +perplexity of the provided `documents` given the inferred topics. **Examples** @@ -564,6 +661,34 @@ public class JavaLDAExample { {% endhighlight %}
+
+{% highlight python %} +from pyspark.mllib.clustering import LDA, LDAModel +from pyspark.mllib.linalg import Vectors + +# Load and parse the data +data = sc.textFile("data/mllib/sample_lda_data.txt") +parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')])) +# Index documents with unique IDs +corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache() + +# Cluster the documents into three topics using LDA +ldaModel = LDA.train(corpus, k=3) + +# Output topics. Each is a distribution over words (matching word count vectors) +print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):") +topics = ldaModel.topicsMatrix() +for topic in range(3): + print("Topic " + str(topic) + ":") + for word in range(0, ldaModel.vocabSize()): + print(" " + str(topics[word][topic])) + +# Save and load model +ldaModel.save(sc, "myModelPath") +sameModel = LDAModel.load(sc, "myModelPath") +{% endhighlight %} +
+
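The model-querying methods listed above can be called on the trained model as well. The following is a minimal Scala sketch, assuming the `ldaModel` from the Scala example above was trained with the default EM optimizer and can therefore be cast to `DistributedLDAModel`.

{% highlight scala %}
// Minimal sketch, assuming `ldaModel` was produced by the EM optimizer.
import org.apache.spark.mllib.clustering.DistributedLDAModel

val distModel = ldaModel.asInstanceOf[DistributedLDAModel]

// The top 5 terms (by weight) for each inferred topic.
val topTerms = distModel.describeTopics(5)

// Log likelihood of the training corpus and log prior of the topics, as described above.
println(s"logLikelihood: ${distModel.logLikelihood}")
println(s"logPrior: ${distModel.logPrior}")
{% endhighlight %}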
## Streaming k-means diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 3aa040046fca..d8c7bdc63c70 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -144,7 +144,7 @@ import org.apache.spark.mllib.regression.LabeledPoint; LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); // Create a labeled point with a negative label and a sparse feature vector. -LabeledPoint neg = new LabeledPoint(1.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); +LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[] {0, 2}, new double[] {1.0, 3.0})); {% endhighlight %}
@@ -337,7 +337,10 @@ limited by the integer range but it should be much smaller in practice.
A [`RowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be -created from an `RDD[Vector]` instance. Then we can compute its column summary statistics. +created from an `RDD[Vector]` instance. Then we can compute its column summary statistics and decompositions. +[QR decomposition](https://en.wikipedia.org/wiki/QR_decomposition) is of the form A = QR where Q is an orthogonal matrix and R is an upper triangular matrix. +For [singular value decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_value_decomposition) and [principal component analysis (PCA)](https://en.wikipedia.org/wiki/Principal_component_analysis), please refer to [Dimensionality reduction](mllib-dimensionality-reduction.html). + {% highlight scala %} import org.apache.spark.mllib.linalg.Vector @@ -350,6 +353,9 @@ val mat: RowMatrix = new RowMatrix(rows) // Get its size. val m = mat.numRows() val n = mat.numCols() + +// QR decomposition +val qrResult = mat.tallSkinnyQR(true) {% endhighlight %}
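The `QRDecomposition` returned by `tallSkinnyQR` exposes the two factors. A minimal sketch, continuing from `qrResult` above:

{% highlight scala %}
// Minimal sketch, assuming `qrResult` from the example above.
val Q = qrResult.Q  // RowMatrix whose columns are orthonormal
val R = qrResult.R  // upper triangular local Matrix
{% endhighlight %}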
@@ -370,14 +376,42 @@ RowMatrix mat = new RowMatrix(rows.rdd()); // Get its size. long m = mat.numRows(); long n = mat.numCols(); + +// QR decomposition +QRDecomposition result = mat.tallSkinnyQR(true); +{% endhighlight %} +
+ +
+ +A [`RowMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix) can be +created from an `RDD` of vectors. + +{% highlight python %} +from pyspark.mllib.linalg.distributed import RowMatrix + +# Create an RDD of vectors. +rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + +# Create a RowMatrix from an RDD of vectors. +mat = RowMatrix(rows) + +# Get its size. +m = mat.numRows() # 4 +n = mat.numCols() # 3 + +# Get the rows as an RDD of vectors again. +rowsRDD = mat.rows {% endhighlight %}
+
### IndexedRowMatrix An `IndexedRowMatrix` is similar to a `RowMatrix` but with meaningful row indices. It is backed by -an RDD of indexed rows, so that each row is represented by its index (long-typed) and a local vector. +an RDD of indexed rows, so that each row is represented by its index (long-typed) and a local +vector.
@@ -431,7 +465,51 @@ long n = mat.numCols(); // Drop its row indices. RowMatrix rowMat = mat.toRowMatrix(); {% endhighlight %} -
+
+ +
+ +An [`IndexedRowMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix) +can be created from an `RDD` of `IndexedRow`s, where +[`IndexedRow`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRow) is a +wrapper over `(long, vector)`. An `IndexedRowMatrix` can be converted to a `RowMatrix` by dropping +its row indices. + +{% highlight python %} +from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix + +# Create an RDD of indexed rows. +# - This can be done explicitly with the IndexedRow class: +indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + IndexedRow(1, [4, 5, 6]), + IndexedRow(2, [7, 8, 9]), + IndexedRow(3, [10, 11, 12])]) +# - or by using (long, vector) tuples: +indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]), + (2, [7, 8, 9]), (3, [10, 11, 12])]) + +# Create an IndexedRowMatrix from an RDD of IndexedRows. +mat = IndexedRowMatrix(indexedRows) + +# Get its size. +m = mat.numRows() # 4 +n = mat.numCols() # 3 + +# Get the rows as an RDD of IndexedRows. +rowsRDD = mat.rows + +# Convert to a RowMatrix by dropping the row indices. +rowMat = mat.toRowMatrix() + +# Convert to a CoordinateMatrix. +coordinateMat = mat.toCoordinateMatrix() + +# Convert to a BlockMatrix. +blockMat = mat.toBlockMatrix() +{% endhighlight %} +
+ + ### CoordinateMatrix @@ -495,6 +573,45 @@ long n = mat.numCols(); IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix(); {% endhighlight %} + +
+ +A [`CoordinateMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.CoordinateMatrix) +can be created from an `RDD` of `MatrixEntry` entries, where +[`MatrixEntry`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.MatrixEntry) is a +wrapper over `(long, long, float)`. A `CoordinateMatrix` can be converted to a `RowMatrix` by +calling `toRowMatrix`, or to an `IndexedRowMatrix` with sparse rows by calling `toIndexedRowMatrix`. + +{% highlight python %} +from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry + +# Create an RDD of coordinate entries. +# - This can be done explicitly with the MatrixEntry class: +entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)]) +# - or using (long, long, float) tuples: +entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) + +# Create an CoordinateMatrix from an RDD of MatrixEntries. +mat = CoordinateMatrix(entries) + +# Get its size. +m = mat.numRows() # 3 +n = mat.numCols() # 2 + +# Get the entries as an RDD of MatrixEntries. +entriesRDD = mat.entries + +# Convert to a RowMatrix. +rowMat = mat.toRowMatrix() + +# Convert to an IndexedRowMatrix. +indexedRowMat = mat.toIndexedRowMatrix() + +# Convert to a BlockMatrix. +blockMat = mat.toBlockMatrix() +{% endhighlight %} +
+ ### BlockMatrix @@ -559,4 +676,39 @@ matA.validate(); BlockMatrix ata = matA.transpose().multiply(matA); {% endhighlight %} + +
+ +A [`BlockMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.BlockMatrix) +can be created from an `RDD` of sub-matrix blocks, where a sub-matrix block is a +`((blockRowIndex, blockColIndex), sub-matrix)` tuple. + +{% highlight python %} +from pyspark.mllib.linalg import Matrices +from pyspark.mllib.linalg.distributed import BlockMatrix + +# Create an RDD of sub-matrix blocks. +blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + +# Create a BlockMatrix from an RDD of sub-matrix blocks. +mat = BlockMatrix(blocks, 3, 2) + +# Get its size. +m = mat.numRows() # 6 +n = mat.numCols() # 2 + +# Get the blocks as an RDD of sub-matrix blocks. +blocksRDD = mat.blocks + +# Convert to a LocalMatrix. +localMat = mat.toLocalMatrix() + +# Convert to an IndexedRowMatrix. +indexedRowMat = mat.toIndexedRowMatrix() + +# Convert to a CoordinateMatrix. +coordinateMat = mat.toCoordinateMatrix() +{% endhighlight %} +
diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index 7521fb14a7bd..1e00b2083ed7 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -9,7 +9,7 @@ displayTitle: MLlib - Ensembles An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning) is a learning algorithm which creates a model composed of a set of other base models. -MLlib supports two major ensemble algorithms: [`GradientBoostedTrees`](api/scala/index.html#org.apache.spark.mllib.tree.GradientBosotedTrees) and [`RandomForest`](api/scala/index.html#org.apache.spark.mllib.tree.RandomForest). +MLlib supports two major ensemble algorithms: [`GradientBoostedTrees`](api/scala/index.html#org.apache.spark.mllib.tree.GradientBoostedTrees) and [`RandomForest`](api/scala/index.html#org.apache.spark.mllib.tree.RandomForest). Both use [decision trees](mllib-decision-tree.html) as their base models. ## Gradient-Boosted Trees vs. Random Forests diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index bcc066a18552..4d4f5cfdc564 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -41,16 +41,23 @@ MLlib's FP-growth implementation takes the following (hyper-)parameters: [`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) implements the FP-growth algorithm. -It take a `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type. +It take a `RDD` of transactions, where each transaction is an `Array` of items of a generic type. Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel) -that stores the frequent itemsets with their frequencies. +that stores the frequent itemsets with their frequencies. The following +example illustrates how to mine frequent itemsets and association rules +(see [Association +Rules](mllib-frequent-pattern-mining.html#association-rules) for +details) from `transactions`. + {% highlight scala %} import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel} +import org.apache.spark.mllib.fpm.FPGrowth + +val data = sc.textFile("data/mllib/sample_fpgrowth.txt") -val transactions: RDD[Array[String]] = ... +val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' ')) val fpg = new FPGrowth() .setMinSupport(0.2) @@ -60,6 +67,14 @@ val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } + +val minConfidence = 0.8 +model.generateAssociationRules(minConfidence).collect().foreach { rule => + println( + rule.antecedent.mkString("[", ",", "]") + + " => " + rule.consequent .mkString("[", ",", "]") + + ", " + rule.confidence) +} {% endhighlight %} @@ -68,21 +83,38 @@ model.freqItemsets.collect().foreach { itemset => [`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) implements the FP-growth algorithm. -It take an `RDD` of transactions, where each transaction is an `Array` of items of a generic type. +It take an `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type. Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html) -that stores the frequent itemsets with their frequencies. +that stores the frequent itemsets with their frequencies. 
The following +example illustrates how to mine frequent itemsets and association rules +(see [Association +Rules](mllib-frequent-pattern-mining.html#association-rules) for +details) from `transactions`. {% highlight java %} +import java.util.Arrays; import java.util.List; -import com.google.common.base.Joiner; - import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.AssociationRules; import org.apache.spark.mllib.fpm.FPGrowth; import org.apache.spark.mllib.fpm.FPGrowthModel; -JavaRDD> transactions = ... +SparkConf conf = new SparkConf().setAppName("FP-growth Example"); +JavaSparkContext sc = new JavaSparkContext(conf); + +JavaRDD data = sc.textFile("data/mllib/sample_fpgrowth.txt"); + +JavaRDD> transactions = data.map( + new Function>() { + public List call(String line) { + String[] parts = line.split(" "); + return Arrays.asList(parts); + } + } +); FPGrowth fpg = new FPGrowth() .setMinSupport(0.2) @@ -90,9 +122,202 @@ FPGrowth fpg = new FPGrowth() FPGrowthModel model = fpg.run(transactions); for (FPGrowth.FreqItemset itemset: model.freqItemsets().toJavaRDD().collect()) { - System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); + System.out.println("[" + itemset.javaItems() + "], " + itemset.freq()); +} + +double minConfidence = 0.8; +for (AssociationRules.Rule rule + : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) { + System.out.println( + rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); } {% endhighlight %} + +
+ +[`FPGrowth`](api/python/pyspark.mllib.html#pyspark.mllib.fpm.FPGrowth) implements the +FP-growth algorithm. +It takes an `RDD` of transactions, where each transaction is a `List` of items of a generic type. +Calling `FPGrowth.train` with transactions returns an +[`FPGrowthModel`](api/python/pyspark.mllib.html#pyspark.mllib.fpm.FPGrowthModel) +that stores the frequent itemsets with their frequencies. + +{% highlight python %} +from pyspark.mllib.fpm import FPGrowth + +data = sc.textFile("data/mllib/sample_fpgrowth.txt") + +transactions = data.map(lambda line: line.strip().split(' ')) + +model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10) + +result = model.freqItemsets().collect() +for fi in result: + print(fi) +{% endhighlight %} +
+ + + +## Association Rules + +
+
+[AssociationRules](api/scala/index.html#org.apache.spark.mllib.fpm.AssociationRules) +implements a parallel rule generation algorithm for constructing rules +that have a single item as the consequent. + +{% highlight scala %} +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.fpm.AssociationRules +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset + +val freqItemsets = sc.parallelize(Seq( + new FreqItemset(Array("a"), 15L), + new FreqItemset(Array("b"), 35L), + new FreqItemset(Array("a", "b"), 12L) +)); + +val ar = new AssociationRules() + .setMinConfidence(0.8) +val results = ar.run(freqItemsets) + +results.collect().foreach { rule => + println("[" + rule.antecedent.mkString(",") + + "=>" + + rule.consequent.mkString(",") + "]," + rule.confidence) +} +{% endhighlight %} + +
+ +
+[AssociationRules](api/java/org/apache/spark/mllib/fpm/AssociationRules.html) +implements a parallel rule generation algorithm for constructing rules +that have a single item as the consequent. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.AssociationRules; +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset; + +JavaRDD> freqItemsets = sc.parallelize(Arrays.asList( + new FreqItemset(new String[] {"a"}, 15L), + new FreqItemset(new String[] {"b"}, 35L), + new FreqItemset(new String[] {"a", "b"}, 12L) +)); + +AssociationRules arules = new AssociationRules() + .setMinConfidence(0.8); +JavaRDD> results = arules.run(freqItemsets); + +for (AssociationRules.Rule rule: results.collect()) { + System.out.println( + rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); +} +{% endhighlight %} + +
+
+ +## PrefixSpan + +PrefixSpan is a sequential pattern mining algorithm described in +[Pei et al., Mining Sequential Patterns by Pattern-Growth: The +PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer +the reader to the referenced paper for the formal definition of the +sequential pattern mining problem. + +MLlib's PrefixSpan implementation takes the following parameters: + +* `minSupport`: the minimum support required to be considered a frequent + sequential pattern. +* `maxPatternLength`: the maximum length of a frequent sequential + pattern. Any frequent pattern exceeding this length will not be + included in the results. +* `maxLocalProjDBSize`: the maximum number of items allowed in a + prefix-projected database before local iterative processing of the + projected database begins. This parameter should be tuned with respect + to the size of your executors (see the sketch after the examples below). + +**Examples** + +The following example illustrates PrefixSpan running on the sequences +(using the same notation as Pei et al): + +~~~ + <(12)3> + <1(32)(12)> + <(12)5> + <6> +~~~ + +
+
+ +[`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the +PrefixSpan algorithm. +Calling `PrefixSpan.run` returns a +[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpanModel) +that stores the frequent sequences with their frequencies. + +{% highlight scala %} +import org.apache.spark.mllib.fpm.PrefixSpan + +val sequences = sc.parallelize(Seq( + Array(Array(1, 2), Array(3)), + Array(Array(1), Array(3, 2), Array(1, 2)), + Array(Array(1, 2), Array(5)), + Array(Array(6)) + ), 2).cache() +val prefixSpan = new PrefixSpan() + .setMinSupport(0.5) + .setMaxPatternLength(5) +val model = prefixSpan.run(sequences) +model.freqSequences.collect().foreach { freqSequence => +println( + freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq) +} +{% endhighlight %} + +
+ +
+ +[`PrefixSpan`](api/java/org/apache/spark/mllib/fpm/PrefixSpan.html) implements the +PrefixSpan algorithm. +Calling `PrefixSpan.run` returns a +[`PrefixSpanModel`](api/java/org/apache/spark/mllib/fpm/PrefixSpanModel.html) +that stores the frequent sequences with their frequencies. + +{% highlight java %} +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.mllib.fpm.PrefixSpan; +import org.apache.spark.mllib.fpm.PrefixSpanModel; + +JavaRDD>> sequences = sc.parallelize(Arrays.asList( + Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)), + Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)), + Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)), + Arrays.asList(Arrays.asList(6)) +), 2); +PrefixSpan prefixSpan = new PrefixSpan() + .setMinSupport(0.5) + .setMaxPatternLength(5); +PrefixSpanModel model = prefixSpan.run(sequences); +for (PrefixSpan.FreqSequence freqSeq: model.freqSequences().toJavaRDD().collect()) { + System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq()); +} +{% endhighlight %} + +
+
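The `maxLocalProjDBSize` parameter described above is set through the same builder pattern as the other parameters; the sketch below assumes a setter named after the parameter.

{% highlight scala %}
// Minimal sketch; the setter name is assumed from the parameter name above.
val tunedPrefixSpan = new PrefixSpan()
  .setMinSupport(0.5)
  .setMaxPatternLength(5)
  .setMaxLocalProjDBSize(32000000L)
{% endhighlight %}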
+ diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index eea864eacf7c..257f7cc7603f 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -5,37 +5,44 @@ displayTitle: Machine Learning Library (MLlib) Guide description: MLlib machine learning library overview for Spark SPARK_VERSION_SHORT --- -MLlib is Spark's scalable machine learning library consisting of common learning algorithms and utilities, -including classification, regression, clustering, collaborative -filtering, dimensionality reduction, as well as underlying optimization primitives. -Guides for individual algorithms are listed below. +MLlib is Spark's machine learning (ML) library. +Its goal is to make practical machine learning scalable and easy. +It consists of common learning algorithms and utilities, including classification, regression, +clustering, collaborative filtering, dimensionality reduction, as well as lower-level optimization +primitives and higher-level pipeline APIs. -The API is divided into 2 parts: +It divides into two packages: -* [The original `spark.mllib` API](mllib-guide.html#mllib-types-algorithms-and-utilities) is the primary API. -* [The "Pipelines" `spark.ml` API](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) is a higher-level API for constructing ML workflows. +* [`spark.mllib`](mllib-guide.html#mllib-types-algorithms-and-utilities) contains the original API + built on top of [RDDs](programming-guide.html#resilient-distributed-datasets-rdds). +* [`spark.ml`](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) provides higher-level API + built on top of [DataFrames](sql-programming-guide.html#dataframes) for constructing ML pipelines. -We list major functionality from both below, with links to detailed guides. +Using `spark.ml` is recommended because with DataFrames the API is more versatile and flexible. +But we will keep supporting `spark.mllib` along with the development of `spark.ml`. +Users should be comfortable using `spark.mllib` features and expect more features coming. +Developers should contribute new algorithms to `spark.ml` if they fit the ML pipeline concept well, +e.g., feature extractors and transformers. -# MLlib types, algorithms and utilities +We list major functionality from both below, with links to detailed guides. -This lists functionality included in `spark.mllib`, the main MLlib API. 
+# spark.mllib: data types, algorithms, and utilities * [Data types](mllib-data-types.html) * [Basic statistics](mllib-statistics.html) - * summary statistics - * correlations - * stratified sampling - * hypothesis testing - * random data generation + * [summary statistics](mllib-statistics.html#summary-statistics) + * [correlations](mllib-statistics.html#correlations) + * [stratified sampling](mllib-statistics.html#stratified-sampling) + * [hypothesis testing](mllib-statistics.html#hypothesis-testing) + * [random data generation](mllib-statistics.html#random-data-generation) * [Classification and regression](mllib-classification-regression.html) * [linear models (SVMs, logistic regression, linear regression)](mllib-linear-methods.html) * [naive Bayes](mllib-naive-bayes.html) * [decision trees](mllib-decision-tree.html) - * [ensembles of trees](mllib-ensembles.html) (Random Forests and Gradient-Boosted Trees) + * [ensembles of trees (Random Forests and Gradient-Boosted Trees)](mllib-ensembles.html) * [isotonic regression](mllib-isotonic-regression.html) * [Collaborative filtering](mllib-collaborative-filtering.html) - * alternating least squares (ALS) + * [alternating least squares (ALS)](mllib-collaborative-filtering.html#collaborative-filtering) * [Clustering](mllib-clustering.html) * [k-means](mllib-clustering.html#k-means) * [Gaussian mixture](mllib-clustering.html#gaussian-mixture) @@ -43,79 +50,76 @@ This lists functionality included in `spark.mllib`, the main MLlib API. * [latent Dirichlet allocation (LDA)](mllib-clustering.html#latent-dirichlet-allocation-lda) * [streaming k-means](mllib-clustering.html#streaming-k-means) * [Dimensionality reduction](mllib-dimensionality-reduction.html) - * singular value decomposition (SVD) - * principal component analysis (PCA) + * [singular value decomposition (SVD)](mllib-dimensionality-reduction.html#singular-value-decomposition-svd) + * [principal component analysis (PCA)](mllib-dimensionality-reduction.html#principal-component-analysis-pca) * [Feature extraction and transformation](mllib-feature-extraction.html) * [Frequent pattern mining](mllib-frequent-pattern-mining.html) - * FP-growth -* [Evaluation Metrics](mllib-evaluation-metrics.html) -* [Optimization (developer)](mllib-optimization.html) - * stochastic gradient descent - * limited-memory BFGS (L-BFGS) + * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) + * [association rules](mllib-frequent-pattern-mining.html#association-rules) + * [PrefixSpan](mllib-frequent-pattern-mining.html#prefix-span) +* [Evaluation metrics](mllib-evaluation-metrics.html) * [PMML model export](mllib-pmml-model-export.html) - -MLlib is under active development. -The APIs marked `Experimental`/`DeveloperApi` may change in future releases, -and the migration guide below will explain all changes between releases. +* [Optimization (developer)](mllib-optimization.html) + * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) + * [limited-memory BFGS (L-BFGS)](mllib-optimization.html#limited-memory-bfgs-l-bfgs) # spark.ml: high-level APIs for ML pipelines -Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of -high-level APIs that help users create and tune practical machine learning pipelines. - -*Graduated from Alpha!* The Pipelines API is no longer an alpha component, although many elements of it are still `Experimental` or `DeveloperApi`. 
+**[spark.ml programming guide](ml-guide.html)** provides an overview of the Pipelines API and major +concepts. It also contains sections on using algorithms within the Pipelines API, for example: -Note that we will keep supporting and adding features to `spark.mllib` along with the -development of `spark.ml`. -Users should be comfortable using `spark.mllib` features and expect more features coming. -Developers should contribute new algorithms to `spark.mllib` and can optionally contribute -to `spark.ml`. - -More detailed guides for `spark.ml` include: - -* **[spark.ml programming guide](ml-guide.html)**: overview of the Pipelines API and major concepts -* [Feature transformers](ml-features.html): Details on transformers supported in the Pipelines API, including a few not in the lower-level `spark.mllib` API -* [Ensembles](ml-ensembles.html): Details on ensemble learning methods in the Pipelines API +* [Feature extraction, transformation, and selection](ml-features.html) +* [Decision trees for classification and regression](ml-decision-tree.html) +* [Ensembles](ml-ensembles.html) +* [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) # Dependencies -MLlib uses the linear algebra package -[Breeze](http://www.scalanlp.org/), which depends on -[netlib-java](https://github.com/fommil/netlib-java) for optimised -numerical processing. If natives are not available at runtime, you -will see a warning message and a pure JVM implementation will be used -instead. +MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on +[netlib-java](https://github.com/fommil/netlib-java) for optimised numerical processing. +If natives libraries[^1] are not available at runtime, you will see a warning message and a pure JVM +implementation will be used instead. -To learn more about the benefits and background of system optimised -natives, you may wish to watch Sam Halliday's ScalaX talk on -[High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/)). +Due to licensing issues with runtime proprietary binaries, we do not include `netlib-java`'s native +proxies by default. +To configure `netlib-java` / Breeze to use system optimised binaries, include +`com.github.fommil.netlib:all:1.1.2` (or build Spark with `-Pnetlib-lgpl`) as a dependency of your +project and read the [netlib-java](https://github.com/fommil/netlib-java) documentation for your +platform's additional installation instructions. -Due to licensing issues with runtime proprietary binaries, we do not -include `netlib-java`'s native proxies by default. To configure -`netlib-java` / Breeze to use system optimised binaries, include -`com.github.fommil.netlib:all:1.1.2` (or build Spark with -`-Pnetlib-lgpl`) as a dependency of your project and read the -[netlib-java](https://github.com/fommil/netlib-java) documentation for -your platform's additional installation instructions. +To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer. -To use MLlib in Python, you will need [NumPy](http://www.numpy.org) -version 1.4 or newer. +[^1]: To learn more about the benefits and background of system optimised natives, you may wish to + watch Sam Halliday's ScalaX talk on [High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/). ---- +# Migration guide -# Migration Guide +MLlib is under active development. 
+The APIs marked `Experimental`/`DeveloperApi` may change in future releases, +and the migration guide below will explain all changes between releases. + +## From 1.4 to 1.5 -For the `spark.ml` package, please see the [spark.ml Migration Guide](ml-guide.html#migration-guide). +In the `spark.mllib` package, there are no break API changes but several behavior changes: -## From 1.3 to 1.4 +* [SPARK-9005](https://issues.apache.org/jira/browse/SPARK-9005): + `RegressionMetrics.explainedVariance` returns the average regression sum of squares. +* [SPARK-8600](https://issues.apache.org/jira/browse/SPARK-8600): `NaiveBayesModel.labels` become + sorted. +* [SPARK-3382](https://issues.apache.org/jira/browse/SPARK-3382): `GradientDescent` has a default + convergence tolerance `1e-3`, and hence iterations might end earlier than 1.4. -In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: +In the `spark.ml` package, there exists one break API change and one behavior change: -* Gradient-Boosted Trees - * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. - * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. -* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. +* [SPARK-9268](https://issues.apache.org/jira/browse/SPARK-9268): Java's varargs support is removed + from `Params.setDefault` due to a + [Scala compiler bug](https://issues.scala-lang.org/browse/SI-9013). +* [SPARK-10097](https://issues.apache.org/jira/browse/SPARK-10097): `Evaluator.isLargerBetter` is + added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4. -## Previous Spark Versions +## Previous Spark versions Earlier migration guides are archived [on this page](mllib-migration-guides.html). + +--- diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md index 5732bc4c7e79..6aa881f74918 100644 --- a/docs/mllib-isotonic-regression.md +++ b/docs/mllib-isotonic-regression.md @@ -160,4 +160,39 @@ model.save(sc.sc(), "myModelPath"); IsotonicRegressionModel sameModel = IsotonicRegressionModel.load(sc.sc(), "myModelPath"); {% endhighlight %} + +
+Data are read from a file where each line has the format `label,feature`, +e.g. `4710.28,500.00`. The data are split into training and test sets. +A model is created using the training set, and the mean squared error is calculated from the +predicted and real labels in the test set. + +{% highlight python %} +import math +from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel + +data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt") + +# Create label, feature, weight tuples from input data with weight set to default value 1.0. +parsedData = data.map(lambda line: tuple([float(x) for x in line.split(',')]) + (1.0,)) + +# Split data into training (60%) and test (40%) sets. +training, test = parsedData.randomSplit([0.6, 0.4], 11) + +# Create isotonic regression model from training data. +# The isotonic parameter defaults to true. +model = IsotonicRegression.train(training) + +# Create tuples of predicted and real labels. +predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0])) + +# Calculate mean squared error between predicted and real labels. +meanSquaredError = predictionAndLabel.map(lambda pl: math.pow((pl[0] - pl[1]), 2)).mean() +print("Mean Squared Error = " + str(meanSquaredError)) + +# Save and load model +model.save(sc, "myModelPath") +sameModel = IsotonicRegressionModel.load(sc, "myModelPath") +{% endhighlight %} +
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 07655baa414b..e9b2d276cd38 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -504,7 +504,6 @@ will in the future. {% highlight python %} from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel from pyspark.mllib.regression import LabeledPoint -from numpy import array # Load and parse the data def parsePoint(line): @@ -676,7 +675,6 @@ Note that the Python API does not yet support model save/load but will in the fu {% highlight python %} from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel -from numpy import array # Load and parse the data def parsePoint(line): diff --git a/docs/mllib-migration-guides.md b/docs/mllib-migration-guides.md index 8df68d81f3c7..774b85d1f773 100644 --- a/docs/mllib-migration-guides.md +++ b/docs/mllib-migration-guides.md @@ -7,6 +7,25 @@ description: MLlib migration guides from before Spark SPARK_VERSION_SHORT The migration guide for the current Spark version is kept on the [MLlib Programming Guide main page](mllib-guide.html#migration-guide). +## From 1.3 to 1.4 + +In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: + +* Gradient-Boosted Trees + * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. + * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. +* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. + +In the `spark.ml` package, several major API changes occurred, including: + +* `Param` and other APIs for specifying parameters +* `uid` unique IDs for Pipeline components +* Reorganization of certain classes + +Since the `spark.ml` API was an alpha component in Spark 1.3, we do not list all changes here. +However, since 1.4 `spark.ml` is no longer an alpha component, we will provide details on any API +changes for future releases. + ## From 1.2 to 1.3 In the `spark.mllib` package, there were several breaking changes. The first change (in `ALS`) is the only one in a component not marked as Alpha or Experimental. @@ -23,6 +42,17 @@ In the `spark.mllib` package, there were several breaking changes. The first ch * In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2. So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2. +In the `spark.ml` package, the main API changes are from Spark SQL. We list the most important changes here: + +* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. 
All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. +* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. +* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. + +Other changes were in `LogisticRegression`: + +* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). +* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. + ## From 1.1 to 1.2 The only API changes in MLlib v1.2 are in diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index be04d0b4b53a..6acfc71d7b01 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -438,22 +438,65 @@ run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstra and interpret the hypothesis tests. {% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.stat.Statistics._ +import org.apache.spark.mllib.stat.Statistics val data: RDD[Double] = ... // an RDD of sample data // run a KS test for the sample versus a standard normal distribution val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) println(testResult) // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis + // and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis // perform a KS test using a cumulative distribution function of our making val myCDF: Double => Double = ... val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) {% endhighlight %} + +
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run +and interpret the hypothesis tests. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; + +JavaSparkContext jsc = ... +JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); +KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); +// summary of the test including the p-value, test statistic, +// and null hypothesis +// if our p-value indicates significance, we can reject the null hypothesis +System.out.println(testResult); +{% endhighlight %} +
+ +
+[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to +run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run +and interpret the hypothesis tests. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +parallelData = sc.parallelize([1.0, 2.0, ... ]) + +# run a KS test for the sample versus a standard normal distribution +testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) +print(testResult) # summary of the test including the p-value, test statistic, + # and null hypothesis + # if our p-value indicates significance, we can reject the null hypothesis +# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with +# a lambda to calculate the CDF is not made available in the Python API +{% endhighlight %} +
@@ -528,5 +571,82 @@ u = RandomRDDs.uniformRDD(sc, 1000000L, 10) v = u.map(lambda x: 1.0 + 2.0 * x) {% endhighlight %} + + +## Kernel density estimation + +[Kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation) is a technique +useful for visualizing empirical probability distributions without requiring assumptions about the +particular distribution that the observed samples are drawn from. It computes an estimate of the +probability density function of a random variable, evaluated at a given set of points. It achieves +this estimate by expressing the PDF of the empirical distribution at a particular point as the +mean of PDFs of normal distributions centered around each of the samples. + +
+ +
+[`KernelDensity`](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) provides methods +to compute kernel density estimates from an RDD of samples. The following example demonstrates how +to do so. + +{% highlight scala %} +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD + +val data: RDD[Double] = ... // an RDD of sample data + +// Construct the density estimator with the sample data and a standard deviation for the Gaussian +// kernels +val kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0) + +// Find density estimates for the given values +val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) +{% endhighlight %} +
+ +
+[`KernelDensity`](api/java/index.html#org.apache.spark.mllib.stat.KernelDensity) provides methods +to compute kernel density estimates from an RDD of samples. The following example demonstrates how +to do so. + +{% highlight java %} +import org.apache.spark.mllib.stat.KernelDensity; +import org.apache.spark.rdd.RDD; + +RDD<Double> data = ... // an RDD of sample data + +// Construct the density estimator with the sample data and a standard deviation for the Gaussian +// kernels +KernelDensity kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0); + +// Find density estimates for the given values +double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); +{% endhighlight %} +
+ +
+[`KernelDensity`](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) provides methods +to compute kernel density estimates from an RDD of samples. The following example demonstrates how +to do so. + +{% highlight python %} +from pyspark.mllib.stat import KernelDensity + +data = ... # an RDD of sample data + +# Construct the density estimator with the sample data and a standard deviation for the Gaussian +# kernels +kd = KernelDensity() +kd.setSample(data) +kd.setBandwidth(3.0) + +# Find density estimates for the given values +densities = kd.estimate([-1.0, 2.0, 5.0]) +{% endhighlight %} +
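To make the "mean of Gaussian PDFs" description above concrete, here is a minimal, self-contained Scala sketch of the same computation on a handful of illustrative sample values; MLlib's `KernelDensity` carries out the equivalent calculation over an RDD:

{% highlight scala %}
// The density estimate at a point x is the mean of Gaussian PDFs centered at each
// sample, all sharing the chosen bandwidth as their standard deviation.
val samples = Seq(1.0, 2.0, 2.5)   // illustrative sample data
val bandwidth = 3.0                // matches the setBandwidth(3.0) calls above

def gaussianPdf(x: Double, mean: Double, sd: Double): Double =
  math.exp(-math.pow(x - mean, 2) / (2 * sd * sd)) / (sd * math.sqrt(2 * math.Pi))

def kde(x: Double): Double =
  samples.map(s => gaussianPdf(x, s, bandwidth)).sum / samples.size

println(kde(2.0)) // should be close to what KernelDensity.estimate returns for x = 2.0
{% endhighlight %}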
diff --git a/docs/monitoring.md b/docs/monitoring.md index bcf885fe4e68..cedceb295802 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -48,7 +48,7 @@ follows: Environment VariableMeaning SPARK_DAEMON_MEMORY - Memory to allocate to the history server (default: 512m). + Memory to allocate to the history server (default: 1g). SPARK_DAEMON_JAVA_OPTS diff --git a/docs/programming-guide.md b/docs/programming-guide.md index ae712d62746f..4cf83bb39263 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -85,8 +85,8 @@ import org.apache.spark.SparkConf
-Spark {{site.SPARK_VERSION}} works with Python 2.6 or higher (but not Python 3). It uses the standard CPython interpreter, -so C libraries like NumPy can be used. +Spark {{site.SPARK_VERSION}} works with Python 2.6+ or Python 3.4+. It can use the standard CPython interpreter, +so C libraries like NumPy can be used. It also works with PyPy 2.3+. To run Spark applications in Python, use the `bin/spark-submit` script located in the Spark directory. This script will load Spark's Java/Scala libraries and allow you to submit applications to a cluster. @@ -104,6 +104,14 @@ Finally, you need to import some Spark classes into your program. Add the follow from pyspark import SparkContext, SparkConf {% endhighlight %} +PySpark requires the same minor version of Python in both driver and workers. It uses the default Python version in your PATH; +you can specify which version of Python you want to use with `PYSPARK_PYTHON`, for example: + +{% highlight bash %} +$ PYSPARK_PYTHON=python3.4 bin/pyspark +$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py +{% endhighlight %} +
@@ -541,7 +549,7 @@ returning only its answer to the driver program. If we also wanted to use `lineLengths` again later, we could add: {% highlight java %} -lineLengths.persist(); +lineLengths.persist(StorageLevel.MEMORY_ONLY()); {% endhighlight %} before the `reduce`, which would cause `lineLengths` to be saved in memory after the first time it is computed. diff --git a/docs/quick-start.md b/docs/quick-start.md index ce2cc9d2169c..d481fe0ea6d7 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -126,7 +126,7 @@ scala> val wordCounts = textFile.flatMap(line => line.split(" ")).map(word => (w wordCounts: spark.RDD[(String, Int)] = spark.ShuffledAggregatedRDD@71f027b8 {% endhighlight %} -Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations) and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (String, Int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action: +Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations), and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (String, Int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action: {% highlight scala %} scala> wordCounts.collect() @@ -163,7 +163,7 @@ One common data flow pattern is MapReduce, as popularized by Hadoop. Spark can i >>> wordCounts = textFile.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b) {% endhighlight %} -Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations) and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (string, int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action: +Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations), and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (string, int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action: {% highlight python %} >>> wordCounts.collect() @@ -217,13 +217,13 @@ a cluster, as described in the [programming guide](programming-guide.html#initia # Self-Contained Applications -Now say we wanted to write a self-contained application using the Spark API. We will walk through a -simple application in both Scala (with SBT), Java (with Maven), and Python. +Suppose we wish to write a self-contained application using the Spark API. We will walk through a +simple application in Scala (with sbt), Java (with Maven), and Python.
-We'll create a very simple Spark application in Scala. So simple, in fact, that it's +We'll create a very simple Spark application in Scala--so simple, in fact, that it's named `SimpleApp.scala`: {% highlight scala %} @@ -259,7 +259,7 @@ object which contains information about our application. Our application depends on the Spark API, so we'll also include an sbt configuration file, -`simple.sbt` which explains that Spark is a dependency. This file also adds a repository that +`simple.sbt`, which explains that Spark is a dependency. This file also adds a repository that Spark depends on: {% highlight scala %} @@ -302,7 +302,7 @@ Lines with a: 46, Lines with b: 23
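For reference, a minimal `simple.sbt` along the lines described above might look like the sketch below; the Scala and Spark versions are illustrative and should match your installation, and any extra resolver the guide mentions would be added with a `resolvers +=` line:

{% highlight scala %}
name := "Simple Project"

version := "1.0"

scalaVersion := "2.10.4"

// Declares Spark as a dependency; the version here is illustrative.
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.5.0"
{% endhighlight %}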
-This example will use Maven to compile an application jar, but any similar build system will work. +This example will use Maven to compile an application JAR, but any similar build system will work. We'll create a very simple Spark application, `SimpleApp.java`: @@ -374,7 +374,7 @@ $ find . Now, we can package the application using Maven and execute it with `./bin/spark-submit`. {% highlight bash %} -# Package a jar containing your application +# Package a JAR containing your application $ mvn package ... [INFO] Building jar: {..}/{..}/target/simple-project-1.0.jar diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index debdd2adf22d..f36921ae30c2 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -45,7 +45,7 @@ frameworks. You can install Mesos either from source or using prebuilt packages To install Apache Mesos from source, follow these steps: 1. Download a Mesos release from a - [mirror](http://www.apache.org/dyn/closer.cgi/mesos/{{site.MESOS_VERSION}}/) + [mirror](http://www.apache.org/dyn/closer.lua/mesos/{{site.MESOS_VERSION}}/) 2. Follow the Mesos [Getting Started](http://mesos.apache.org/gettingstarted) page for compiling and installing Mesos @@ -216,6 +216,20 @@ node. Please refer to [Hadoop on Mesos](https://github.com/mesos/hadoop). In either case, HDFS runs separately from Hadoop MapReduce, without being scheduled through Mesos. +# Dynamic Resource Allocation with Mesos + +Mesos supports dynamic allocation only with coarse grain mode, which can resize the number of executors based on statistics +of the application. While dynamic allocation supports both scaling up and scaling down the number of executors, the coarse grain scheduler only supports scaling down +since it is already designed to run one executor per slave with the configured amount of resources. However, after scaling down the number of executors the coarse grain scheduler +can scale back up to the same amount of executors when Spark signals more executors are needed. + +Users that like to utilize this feature should launch the Mesos Shuffle Service that +provides shuffle data cleanup functionality on top of the Shuffle Service since Mesos doesn't yet support notifying another framework's +termination. To launch/stop the Mesos Shuffle Service please use the provided sbin/start-mesos-shuffle-service.sh and sbin/stop-mesos-shuffle-service.sh +scripts accordingly. + +The Shuffle Service is expected to be running on each slave node that will run Spark executors. One way to easily achieve this with Mesos +is to launch the Shuffle Service with Marathon with a unique host constraint. # Configuration @@ -306,6 +320,14 @@ See the [configuration page](configuration.html) for information on Spark config the final overhead will be this value. + + spark.mesos.uris + (none) + + A list of URIs to be downloaded to the sandbox when driver or executor is launched by Mesos. + This applies to both coarse-grain and fine-grain mode. + + spark.mesos.principal Framework principal to authenticate to Mesos diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index cac08a91b97d..ec32c419b7c5 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -199,7 +199,7 @@ If you need a reference to the proper location to put log files in the YARN so t spark.executor.instances 2 - The number of executors. Note that this property is incompatible with spark.dynamicAllocation.enabled. + The number of executors. 
Note that this property is incompatible with spark.dynamicAllocation.enabled. If both spark.dynamicAllocation.enabled and spark.executor.instances are specified, dynamic allocation is turned off and the specified number of spark.executor.instances is used. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 4f71fbc086cd..2fe9ec3542b2 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -152,7 +152,7 @@ You can optionally configure the cluster further by setting environment variable SPARK_DAEMON_MEMORY - Memory to allocate to the Spark master and worker daemons themselves (default: 512m). + Memory to allocate to the Spark master and worker daemons themselves (default: 1g). SPARK_DAEMON_JAVA_OPTS diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4eeacd5..7139d16b4a06 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib. # SparkR DataFrames @@ -230,3 +231,37 @@ head(teenagers) {% endhighlight %}
+ +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR. + +
+{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +## Estimate +##(Intercept) 2.2513930 +##Sepal_Width 0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df) +head(select(predictions, "Sepal_Length", "prediction")) +## Sepal_Length prediction +##1 5.1 5.063856 +##2 4.9 4.662076 +##3 4.7 4.822788 +##4 4.6 4.742432 +##5 5.0 5.144212 +##6 5.4 5.385281 +{% endhighlight %} +
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3ea77e82422f..a0b911d20724 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -11,7 +11,7 @@ title: Spark SQL and DataFrames Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as distributed SQL query engine. -For how to enable Hive support, please refer to the [Hive Tables](#hive-tables) section. +Spark SQL can also be used to read data from an existing Hive installation. For more on how to configure this feature, please refer to the [Hive Tables](#hive-tables) section. # DataFrames @@ -28,7 +28,7 @@ All of the examples on this page use sample data included in the Spark distribut
The entry point into all functionality in Spark SQL is the -[`SQLContext`](api/scala/index.html#org.apache.spark.sql.`SQLContext`) class, or one of its +[`SQLContext`](api/scala/index.html#org.apache.spark.sql.SQLContext) class, or one of its descendants. To create a basic `SQLContext`, all you need is a SparkContext. {% highlight scala %} @@ -213,6 +213,11 @@ df.groupBy("age").count().show() // 30 1 {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/scala/index.html#org.apache.spark.sql.DataFrame). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/scala/index.html#org.apache.spark.sql.DataFrame). + +
@@ -263,6 +268,10 @@ df.groupBy("age").count().show(); // 30 1 {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/java/org/apache/spark/sql/DataFrame.html). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/java/org/apache/spark/sql/functions.html). +
@@ -320,6 +329,10 @@ df.groupBy("age").count().show() {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/python/pyspark.sql.html#pyspark.sql.DataFrame). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/python/pyspark.sql.html#module-pyspark.sql.functions). +
@@ -370,10 +383,13 @@ showDF(count(groupBy(df, "age"))) {% endhighlight %} -
+For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/R/index.html).
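As a quick illustration of the function library mentioned above, a short Scala sketch (assuming a DataFrame `df` with `name` and `age` columns, as in the earlier examples; the particular functions are arbitrary choices):

{% highlight scala %}
import org.apache.spark.sql.functions._

// A few of the built-in column functions: string manipulation, arithmetic, and a math helper.
// Date arithmetic and many other helpers live in the same functions object.
df.select(upper(col("name")), col("age") + 1, sqrt(col("age"))).show()
{% endhighlight %}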
+ ## Running SQL Queries Programmatically @@ -870,12 +886,11 @@ saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet") Save operations can optionally take a `SaveMode`, that specifies how to handle existing data if present. It is important to realize that these save modes do not utilize any locking and are not -atomic. Thus, it is not safe to have multiple writers attempting to write to the same location. -Additionally, when performing a `Overwrite`, the data will be deleted before writing out the +atomic. Additionally, when performing a `Overwrite`, the data will be deleted before writing out the new data. - + @@ -1124,6 +1139,13 @@ a simple schema, and gradually add more columns to the schema as needed. In thi up with multiple Parquet files with different but mutually compatible schemas. The Parquet data source is now able to automatically detect this case and merge schemas of all these files. +Since schema merging is a relatively expensive operation, and is not a necessity in most cases, we +turned it off by default starting from 1.5.0. You may enable it by + +1. setting data source option `mergeSchema` to `true` when reading Parquet files (as shown in the + examples below), or +2. setting the global SQL option `spark.sql.parquet.mergeSchema` to `true`. +
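The first option is shown in the examples below; for the second, a one-line Scala sketch (assuming an existing `sqlContext`) would be:

{% highlight scala %}
// Enable Parquet schema merging globally instead of per read;
// spark.sql.parquet.mergeSchema is the SQL option named above.
sqlContext.setConf("spark.sql.parquet.mergeSchema", "true")
{% endhighlight %}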
@@ -1143,7 +1165,7 @@ val df2 = sc.makeRDD(6 to 10).map(i => (i, i * 3)).toDF("single", "triple") df2.write.parquet("data/test_table/key=2") // Read the partitioned table -val df3 = sqlContext.read.parquet("data/test_table") +val df3 = sqlContext.read.option("mergeSchema", "true").parquet("data/test_table") df3.printSchema() // The final schema consists of all 3 columns in the Parquet files together @@ -1165,16 +1187,16 @@ df3.printSchema() # Create a simple DataFrame, stored into a partition directory df1 = sqlContext.createDataFrame(sc.parallelize(range(1, 6))\ .map(lambda i: Row(single=i, double=i * 2))) -df1.save("data/test_table/key=1", "parquet") +df1.write.parquet("data/test_table/key=1") # Create another DataFrame in a new partition directory, # adding a new column and dropping an existing column df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11)) .map(lambda i: Row(single=i, triple=i * 3))) -df2.save("data/test_table/key=2", "parquet") +df2.write.parquet("data/test_table/key=2") # Read the partitioned table -df3 = sqlContext.load("data/test_table", "parquet") +df3 = sqlContext.read.option("mergeSchema", "true").parquet("data/test_table") df3.printSchema() # The final schema consists of all 3 columns in the Parquet files together @@ -1201,7 +1223,7 @@ saveDF(df1, "data/test_table/key=1", "parquet", "overwrite") saveDF(df2, "data/test_table/key=2", "parquet", "overwrite") # Read the partitioned table -df3 <- loadDF(sqlContext, "data/test_table", "parquet") +df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true") printSchema(df3) # The final schema consists of all 3 columns in the Parquet files together @@ -1301,7 +1323,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`
@@ -1310,8 +1332,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext` @@ -1355,6 +1376,9 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`

Note:

    +
  • + This option is automatically ignored if spark.speculation is turned on. +
  • This option must be set via Hadoop Configuration rather than Spark SQLConf. @@ -1371,6 +1395,16 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`

    +
+ + + +
Scala/JavaPythonMeaning
Scala/JavaAny LanguageMeaning
SaveMode.ErrorIfExists (default) "error" (default)spark.sql.parquet.binaryAsString false - Some other Parquet-producing systems, in particular Impala and older versions of Spark SQL, do + Some other Parquet-producing systems, in particular Impala, Hive, and older versions of Spark SQL, do not differentiate between binary data and strings when writing out the Parquet schema. This flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems. spark.sql.parquet.int96AsTimestamp true - Some Parquet-producing systems, in particular Impala, store Timestamp into INT96. Spark would also - store Timestamp as INT96 because we need to avoid precision lost of the nanoseconds field. This + Some Parquet-producing systems, in particular Impala and Hive, store Timestamp into INT96. This flag tells Spark SQL to interpret INT96 data as a timestamp to provide compatibility with these systems.
spark.sql.parquet.mergeSchemafalse +

+ When true, the Parquet data source merges schemas collected from all data files, otherwise the + schema is picked from the summary file or a random data file if no summary file is available. +

+
## JSON Datasets @@ -1642,21 +1676,21 @@ results <- collect(sql(sqlContext, "FROM src SELECT key, value")) ### Interacting with Different Versions of Hive Metastore One of the most important pieces of Spark SQL's Hive support is interaction with Hive metastore, -which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. +which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary +build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. +Note that independent of the version of Hive that is being used to talk to the metastore, internally Spark SQL +will compile against Hive 1.2.1 and use those classes for internal execution (serdes, UDFs, UDAFs, etc). -Internally, Spark SQL uses two Hive clients, one for executing native Hive commands like `SET` -and `DESCRIBE`, the other dedicated for communicating with Hive metastore. The former uses Hive -jars of version 0.13.1, which are bundled with Spark 1.4.0. The latter uses Hive jars of the -version specified by users. An isolated classloader is used here to avoid dependency conflicts. +The following options can be used to configure the version of Hive that is used to retrieve metadata: - + @@ -1667,12 +1701,16 @@ version specified by users. An isolated classloader is used here to avoid depend property can be one of three options:
  1. builtin
  2. - Use Hive 0.13.1, which is bundled with the Spark assembly jar when -Phive is + Use Hive 1.2.1, which is bundled with the Spark assembly jar when -Phive is enabled. When this option is chosen, spark.sql.hive.metastore.version must be - either 0.13.1 or not defined. + either 1.2.1 or not defined.
  3. maven
  4. - Use Hive jars of specified version downloaded from Maven repositories. -
  5. A classpath in the standard format for both Hive and Hadoop.
  6. + Use Hive jars of specified version downloaded from Maven repositories. This configuration + is not generally recommended for production deployments. +
  7. A classpath in the standard format for the JVM. This classpath must include all of Hive + and its dependencies, including the correct version of Hadoop. These jars only need to be + present on the driver, but if you are running in yarn cluster mode then you must ensure + they are packaged with your application.
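Putting the options above together, one possible configuration is sketched below; the property names are the Hive metastore settings described in this section, and the values shown are illustrative and must match your environment:

{% highlight scala %}
import org.apache.spark.SparkConf

// Talk to a Hive 0.13.1 metastore while Spark SQL itself executes against its
// built-in Hive 1.2.1 classes. "maven" downloads the matching client jars.
val conf = new SparkConf()
  .set("spark.sql.hive.metastore.version", "0.13.1")
  .set("spark.sql.hive.metastore.jars", "maven")
{% endhighlight %}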
@@ -1884,11 +1922,11 @@ that these options will be deprecated in future release as more optimizations ar - + @@ -1988,6 +2026,34 @@ options. # Migration Guide +## Upgrading From Spark SQL 1.4 to 1.5 + + - Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with + code generation for expression evaluation. These features can both be disabled by setting + `spark.sql.tungsten.enabled` to `false`. + - Parquet schema merging is no longer enabled by default. It can be re-enabled by setting + `spark.sql.parquet.mergeSchema` to `true`. + - Resolution of strings to columns in Python now supports using dots (`.`) to qualify the column or + access nested values. For example `df['table.column.nestedField']`. However, this means that if + your column name contains any dots you must now escape them using backticks (e.g., ``table.`column.with.dots`.nested``). + - In-memory columnar storage partition pruning is on by default. It can be disabled by setting + `spark.sql.inMemoryColumnarStorage.partitionPruning` to `false`. + - Unlimited precision decimal columns are no longer supported; instead, Spark SQL enforces a maximum + precision of 38. When inferring schema from `BigDecimal` objects, a precision of (38, 18) is now + used. When no precision is specified in DDL then the default remains `Decimal(10, 0)`. + - Timestamps are now stored at a precision of 1us, rather than 1ns. + - In the `sql` dialect, floating point numbers are now parsed as decimal. HiveQL parsing remains + unchanged. + - The canonical names of SQL/DataFrame functions are now lower case (e.g. sum vs SUM). + - It has been determined that using the DirectOutputCommitter when speculation is enabled is unsafe + and thus this output committer will not be used when speculation is on, independent of configuration. + - The JSON data source will not automatically load new files that are created by other applications + (i.e. files that are not inserted to the dataset through Spark SQL). + For a JSON persistent table (i.e. the metadata of the table is stored in Hive Metastore), + users can use the `REFRESH TABLE` SQL command or `HiveContext`'s `refreshTable` method + to include those new files in the table. For a DataFrame representing a JSON dataset, users need to recreate + the DataFrame and the new DataFrame will include new files. + ## Upgrading from Spark SQL 1.3 to 1.4 #### DataFrame data reader/writer interface @@ -2009,7 +2075,8 @@ See the API docs for `SQLContext.read` ( #### DataFrame.groupBy retains grouping columns -Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`. +Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the +grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
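A minimal sketch (assuming an existing `sqlContext`) of restoring that 1.3 behavior:

{% highlight scala %}
// Revert to the Spark 1.3 behavior: grouping columns are dropped from agg() results.
sqlContext.setConf("spark.sql.retainGroupColumns", "false")
{% endhighlight %}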
@@ -2146,7 +2213,7 @@ Python UDF registration is unchanged. When using DataTypes in Python you will need to construct them (i.e. `StringType()`) instead of referencing a singleton. -## Migration Guide for Shark User +## Migration Guide for Shark Users ### Scheduling To set a [Fair Scheduler](job-scheduling.html#fair-scheduler-pools) pool for a JDBC client session, @@ -2222,6 +2289,7 @@ Spark SQL supports the vast majority of Hive features, such as: * User defined functions (UDF) * User defined aggregation functions (UDAF) * User defined serialization formats (SerDes) +* Window functions * Joins * `JOIN` * `{LEFT|RIGHT|FULL} OUTER JOIN` @@ -2232,7 +2300,7 @@ Spark SQL supports the vast majority of Hive features, such as: * `SELECT col FROM ( SELECT a + b AS col from t1) t2` * Sampling * Explain -* Partitioned tables +* Partitioned tables including dynamic partition insertion * View * All Hive DDL Functions, including: * `CREATE TABLE` @@ -2294,8 +2362,9 @@ releases of Spark SQL. Hive can optionally merge the small files into fewer large files to avoid overflowing the HDFS metadata. Spark SQL does not support that. +# Reference -# Data Types +## Data Types Spark SQL and DataFrames support the following data types: @@ -2908,3 +2977,13 @@ from pyspark.sql.types import *
+## NaN Semantics + +There is specially handling for not-a-number (NaN) when dealing with `float` or `double` types that +does not exactly match standard floating point semantics. +Specifically: + + - NaN = NaN returns true. + - In aggregations all NaN values are grouped together. + - NaN is treated as a normal value in join keys. + - NaN values go last when in ascending order, larger than any other numeric value. diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index de0461010dae..383d954409ce 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -5,8 +5,6 @@ title: Spark Streaming + Flume Integration Guide [Apache Flume](https://flume.apache.org/) is a distributed, reliable, and available service for efficiently collecting, aggregating, and moving large amounts of log data. Here we explain how to configure Flume and Spark Streaming to receive data from Flume. There are two approaches to this. -Python API Flume is not yet available in the Python API. - ## Approach 1: Flume-style Push-based Approach Flume is designed to push data between Flume agents. In this approach, Spark Streaming essentially sets up a receiver that acts an Avro agent for Flume, to which Flume can push the data. Here are the configuration steps. diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md index 775d508d4879..5db39ae54a27 100644 --- a/docs/streaming-kafka-integration.md +++ b/docs/streaming-kafka-integration.md @@ -82,7 +82,7 @@ This approach has the following advantages over the receiver-based approach (i.e - *Efficiency:* Achieving zero-data loss in the first approach required the data to be stored in a Write Ahead Log, which further replicated the data. This is actually inefficient as the data effectively gets replicated twice - once by Kafka, and a second time by the Write Ahead Log. This second approach eliminates the problem as there is no receiver, and hence no need for Write Ahead Logs. As long as you have sufficient Kafka retention, messages can be recovered from Kafka. -- *Exactly-once semantics:* The first approach uses Kafka's high level API to store consumed offsets in Zookeeper. This is traditionally the way to consume data from Kafka. While this approach (in combination with write ahead logs) can ensure zero data loss (i.e. at-least once semantics), there is a small chance some records may get consumed twice under some failures. This occurs because of inconsistencies between data reliably received by Spark Streaming and offsets tracked by Zookeeper. Hence, in this second approach, we use simple Kafka API that does not use Zookeeper. Offsets are tracked by Spark Streaming within its checkpoints. This eliminates inconsistencies between Spark Streaming and Zookeeper/Kafka, and so each record is received by Spark Streaming effectively exactly once despite failures. In order to achieve exactly-once semantics for output of your results, your output operation that saves the data to an external data store must be either idempotent, or an atomic transaction that saves results and offsets (see [Semanitcs of output operations](streaming-programming-guide.html#semantics-of-output-operations) in the main programming guide for further information). +- *Exactly-once semantics:* The first approach uses Kafka's high level API to store consumed offsets in Zookeeper. This is traditionally the way to consume data from Kafka. 
While this approach (in combination with write ahead logs) can ensure zero data loss (i.e. at-least once semantics), there is a small chance some records may get consumed twice under some failures. This occurs because of inconsistencies between data reliably received by Spark Streaming and offsets tracked by Zookeeper. Hence, in this second approach, we use simple Kafka API that does not use Zookeeper. Offsets are tracked by Spark Streaming within its checkpoints. This eliminates inconsistencies between Spark Streaming and Zookeeper/Kafka, and so each record is received by Spark Streaming effectively exactly once despite failures. In order to achieve exactly-once semantics for output of your results, your output operation that saves the data to an external data store must be either idempotent, or an atomic transaction that saves results and offsets (see [Semantics of output operations](streaming-programming-guide.html#semantics-of-output-operations) in the main programming guide for further information). Note that one disadvantage of this approach is that it does not update offsets in Zookeeper, hence Zookeeper-based Kafka monitoring tools will not show progress. However, you can access the offsets processed by this approach in each batch and update Zookeeper yourself (see below). @@ -152,7 +152,7 @@ Next, we discuss how to use this approach in your streaming application.
// Hold a reference to the current offset ranges, so it can be used downstream - final AtomicReference offsetRanges = new AtomicReference(); + final AtomicReference offsetRanges = new AtomicReference<>(); directKafkaStream.transformToPair( new Function, JavaPairRDD>() { diff --git a/docs/streaming-kinesis-integration.md b/docs/streaming-kinesis-integration.md index a7bcaec6fcd8..238a911a9199 100644 --- a/docs/streaming-kinesis-integration.md +++ b/docs/streaming-kinesis-integration.md @@ -91,7 +91,7 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m - Kinesis data processing is ordered per partition and occurs at-least once per message. - - Multiple applications can read from the same Kinesis stream. Kinesis will maintain the application-specific shard and checkpoint info in DynamodDB. + - Multiple applications can read from the same Kinesis stream. Kinesis will maintain the application-specific shard and checkpoint info in DynamoDB. - A single Kinesis stream shard is processed by one input DStream at a time. diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 4663b3f14c52..c751dbb41785 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -50,13 +50,7 @@ all of which are presented in this guide. You will find tabs throughout this guide that let you choose between code snippets of different languages. -**Note:** Python API for Spark Streaming has been introduced in Spark 1.2. It has all the DStream -transformations and almost all the output operations available in Scala and Java interfaces. -However, it only has support for basic sources like text files and text data over sockets. -APIs for additional sources, like Kafka and Flume, will be available in the future. -Further information about available features in the Python API are mentioned throughout this -document; look out for the tag -Python API. +**Note:** There are a few APIs that are either different or not available in Python. Throughout this guide, you will find the tag Python API highlighting these differences. *************************************************************************************************** @@ -683,7 +677,7 @@ for Java, and [StreamingContext](api/python/pyspark.streaming.html#pyspark.strea {:.no_toc} Python API As of Spark {{site.SPARK_VERSION_SHORT}}, -out of these sources, *only* Kafka and Flume are available in the Python API. We will add more advanced sources in the Python API in future. +out of these sources, Kafka, Kinesis, Flume and MQTT are available in the Python API. This category of sources require interfacing with external non-Spark libraries, some of them with complex dependencies (e.g., Kafka and Flume). Hence, to minimize issues related to version conflicts @@ -725,9 +719,9 @@ Some of these advanced sources are as follows. - **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka 0.8.2.1. See the [Kafka Integration Guide](streaming-kafka-integration.html) for more details. -- **Flume:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Flume 1.4.0. See the [Flume Integration Guide](streaming-flume-integration.html) for more details. +- **Flume:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Flume 1.6.0. See the [Flume Integration Guide](streaming-flume-integration.html) for more details. -- **Kinesis:** See the [Kinesis Integration Guide](streaming-kinesis-integration.html) for more details. 
+- **Kinesis:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kinesis Client Library 1.2.1. See the [Kinesis Integration Guide](streaming-kinesis-integration.html) for more details. - **Twitter:** Spark Streaming's TwitterUtils uses Twitter4j 3.0.3 to get the public stream of tweets using [Twitter's Streaming API](https://dev.twitter.com/docs/streaming-apis). Authentication information @@ -1141,7 +1135,7 @@ val joinedStream = stream1.join(stream2) {% highlight java %} JavaPairDStream stream1 = ... JavaPairDStream stream2 = ... -JavaPairDStream joinedStream = stream1.join(stream2); +JavaPairDStream> joinedStream = stream1.join(stream2); {% endhighlight %}
@@ -1166,7 +1160,7 @@ val joinedStream = windowedStream1.join(windowedStream2) {% highlight java %} JavaPairDStream windowedStream1 = stream1.window(Durations.seconds(20)); JavaPairDStream windowedStream2 = stream2.window(Durations.minutes(1)); -JavaPairDStream joinedStream = windowedStream1.join(windowedStream2); +JavaPairDStream> joinedStream = windowedStream1.join(windowedStream2); {% endhighlight %}
@@ -1702,7 +1696,7 @@ context.awaitTermination(); If the `checkpointDirectory` exists, then the context will be recreated from the checkpoint data. If the directory does not exist (i.e., running for the first time), then the function `contextFactory` will be called to create a new -context and set up the DStreams. See the Scala example +context and set up the DStreams. See the Java example [JavaRecoverableNetworkWordCount]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecoverableNetworkWordCount.java). This example appends the word counts of network data into a file. @@ -1813,7 +1807,7 @@ To run a Spark Streaming applications, you need to have the following. + *Mesos* - [Marathon](https://github.com/mesosphere/marathon) has been used to achieve this with Mesos. -- *[Since Spark 1.2] Configuring write ahead logs* - Since Spark 1.2, +- *Configuring write ahead logs* - Since Spark 1.2, we have introduced _write ahead logs_ for achieving strong fault-tolerance guarantees. If enabled, all the data received from a receiver gets written into a write ahead log in the configuration checkpoint directory. This prevents data loss on driver @@ -1828,6 +1822,17 @@ To run a Spark Streaming applications, you need to have the following. stored in a replicated storage system. This can be done by setting the storage level for the input stream to `StorageLevel.MEMORY_AND_DISK_SER`. +- *Setting the max receiving rate* - If the cluster resources is not large enough for the streaming + application to process data as fast as it is being received, the receivers can be rate limited + by setting a maximum rate limit in terms of records / sec. + See the [configuration parameters](configuration.html#spark-streaming) + `spark.streaming.receiver.maxRate` for receivers and `spark.streaming.kafka.maxRatePerPartition` + for Direct Kafka approach. In Spark 1.5, we have introduced a feature called *backpressure* that + eliminate the need to set this rate limit, as Spark Streaming automatically figures out the + rate limits and dynamically adjusts them if the processing conditions change. This backpressure + can be enabled by setting the [configuration parameter](configuration.html#spark-streaming) + `spark.streaming.backpressure.enabled` to `true`. + ### Upgrading Application Code {:.no_toc} diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index e58645274e52..7ea4d6f1a3f8 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -65,8 +65,8 @@ For Python applications, simply pass a `.py` file in the place of ` org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 9df26ffca577..a377694507d2 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -124,7 +124,7 @@ public String uid() { /** * Param for max number of iterations - *

+ *

* NOTE: The usual way to add a parameter to a model or algorithm is to include: * - val myParamName: ParamType * - def getMyParamName @@ -222,7 +222,7 @@ public Vector predictRaw(Vector features) { /** * Create a copy of the model. * The copy is shallow, except for the embedded paramMap, which gets a deep copy. - *

+ *

* This is used for the defaul implementation of [[transform()]]. * * In Java, we have to make this method public since Java does not understand Scala's protected @@ -230,6 +230,7 @@ public Vector predictRaw(Vector features) { */ @Override public MyJavaLogisticRegressionModel copy(ParamMap extra) { - return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra); + return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra) + .setParent(parent()); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index 75063dbf800d..e7f2f6f61507 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -178,6 +178,7 @@ private static Params parse(String[] args) { return params; } + @SuppressWarnings("static") private static Options generateCommandlineOptions() { Option input = OptionBuilder.withArgName("input") .hasArg() diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index dac649d1d5ae..94beeced3d47 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -77,7 +77,8 @@ public static void main(String[] args) { ParamMap paramMap = new ParamMap(); paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. - paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. + double thresholds[] = {0.45, 0.55}; + paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params. // One can also combine ParamMaps. ParamMap paramMap2 = new ParamMap(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java new file mode 100644 index 000000000000..23f834ab4332 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.tuning.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; + +/** + * A simple example demonstrating model selection using TrainValidationSplit. + * + * The example is based on {@link org.apache.spark.examples.ml.JavaSimpleParamsExample} + * using linear regression. + * + * Run with + * {{{ + * bin/run-example ml.JavaTrainValidationSplitExample + * }}} + */ +public class JavaTrainValidationSplitExample { + + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("JavaTrainValidationSplitExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext jsql = new SQLContext(jsc); + + DataFrame data = jsql.createDataFrame( + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), + LabeledPoint.class); + + // Prepare training and test data. + DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); + DataFrame training = splits[0]; + DataFrame test = splits[1]; + + LinearRegression lr = new LinearRegression(); + + // We use a ParamGridBuilder to construct a grid of parameters to search over. + // TrainValidationSplit will try all combinations of values and determine best model using + // the evaluator. + ParamMap[] paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam(), new double[] {0.1, 0.01}) + .addGrid(lr.fitIntercept()) + .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) + .build(); + + // In this case the estimator is simply the linear regression. + // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. + TrainValidationSplit trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator()) + .setEstimatorParamMaps(paramGrid); + + // 80% of the data will be used for training and the remaining 20% for validation. + trainValidationSplit.setTrainRatio(0.8); + + // Run train validation split, and choose the best set of parameters. + TrainValidationSplitModel model = trainValidationSplit.fit(training); + + // Make predictions on test data. model is the model with combination of parameters + // that performed best. + model.transform(test) + .select("features", "label", "prediction") + .show(); + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java index dbf2ef02d7b7..99b63a2590ae 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java @@ -45,7 +45,7 @@ * Usage: JavaStatefulNetworkWordCount * and describe the TCP server that Spark Streaming would connect to receive * data. - *

+ *

* To run this on your local machine, you need to first run a Netcat server * `$ nc -lk 9999` * and then run the example @@ -85,7 +85,7 @@ public Optional call(List values, Optional state) { @SuppressWarnings("unchecked") List> tuples = Arrays.asList(new Tuple2("hello", 1), new Tuple2("world", 1)); - JavaPairRDD initialRDD = ssc.sc().parallelizePairs(tuples); + JavaPairRDD initialRDD = ssc.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream lines = ssc.socketTextStream( args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER_2); @@ -107,7 +107,7 @@ public Tuple2 call(String s) { // This will give a Dstream made of state (which is the cumulative count of the words) JavaPairDStream stateDstream = wordsDstream.updateStateByKey(updateFunction, - new HashPartitioner(ssc.sc().defaultParallelism()), initialRDD); + new HashPartitioner(ssc.sparkContext().defaultParallelism()), initialRDD); stateDstream.print(); ssc.start(); diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py index a9f29dab2d60..2d6d115d54d0 100644 --- a/examples/src/main/python/ml/simple_params_example.py +++ b/examples/src/main/python/ml/simple_params_example.py @@ -70,7 +70,7 @@ # We may alternatively specify parameters using a parameter map. # paramMap overrides all lr parameters set earlier. - paramMap = {lr.maxIter: 20, lr.threshold: 0.55, lr.probabilityCol: "myProbability"} + paramMap = {lr.maxIter: 20, lr.thresholds: [0.45, 0.55], lr.probabilityCol: "myProbability"} # Now learn a new model using the new parameters. model2 = lr.fit(training, paramMap) diff --git a/examples/src/main/python/streaming/direct_kafka_wordcount.py b/examples/src/main/python/streaming/direct_kafka_wordcount.py index 6ef188a220c5..ea20678b9aca 100644 --- a/examples/src/main/python/streaming/direct_kafka_wordcount.py +++ b/examples/src/main/python/streaming/direct_kafka_wordcount.py @@ -23,8 +23,8 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example - `$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar \ + `$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ examples/src/main/python/streaming/direct_kafka_wordcount.py \ localhost:9092 test` """ @@ -37,7 +37,7 @@ if __name__ == "__main__": if len(sys.argv) != 3: - print >> sys.stderr, "Usage: direct_kafka_wordcount.py " + print("Usage: direct_kafka_wordcount.py ", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount") diff --git a/examples/src/main/python/streaming/flume_wordcount.py b/examples/src/main/python/streaming/flume_wordcount.py index 091b64d8c4af..d75bc6daac13 100644 --- a/examples/src/main/python/streaming/flume_wordcount.py +++ b/examples/src/main/python/streaming/flume_wordcount.py @@ -23,8 +23,9 @@ https://flume.apache.org/documentation.html and then run the example - `$ bin/spark-submit --jars external/flume-assembly/target/scala-*/\ - spark-streaming-flume-assembly-*.jar examples/src/main/python/streaming/flume_wordcount.py \ + `$ bin/spark-submit --jars \ + external/flume-assembly/target/scala-*/spark-streaming-flume-assembly-*.jar \ + examples/src/main/python/streaming/flume_wordcount.py \ localhost 12345 """ from __future__ import print_function diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index b178e7899b5e..8d697f620f46 100644 --- 
a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,8 +23,9 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example - `$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ + `$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ + examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` """ from __future__ import print_function diff --git a/examples/src/main/python/streaming/mqtt_wordcount.py b/examples/src/main/python/streaming/mqtt_wordcount.py new file mode 100644 index 000000000000..abf9c0e21d30 --- /dev/null +++ b/examples/src/main/python/streaming/mqtt_wordcount.py @@ -0,0 +1,59 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" + A sample wordcount with MqttStream stream + Usage: mqtt_wordcount.py + + To run this in your local machine, you need to setup a MQTT broker and publisher first, + Mosquitto is one of the open source MQTT Brokers, see + http://mosquitto.org/ + Eclipse paho project provides number of clients and utilities for working with MQTT, see + http://www.eclipse.org/paho/#getting-started + + and then run the example + `$ bin/spark-submit --jars \ + external/mqtt-assembly/target/scala-*/spark-streaming-mqtt-assembly-*.jar \ + examples/src/main/python/streaming/mqtt_wordcount.py \ + tcp://localhost:1883 foo` +""" + +import sys + +from pyspark import SparkContext +from pyspark.streaming import StreamingContext +from pyspark.streaming.mqtt import MQTTUtils + +if __name__ == "__main__": + if len(sys.argv) != 3: + print >> sys.stderr, "Usage: mqtt_wordcount.py " + exit(-1) + + sc = SparkContext(appName="PythonStreamingMQTTWordCount") + ssc = StreamingContext(sc, 1) + + brokerUrl = sys.argv[1] + topic = sys.argv[2] + + lines = MQTTUtils.createStream(ssc, brokerUrl, topic) + counts = lines.flatMap(lambda line: line.split(" ")) \ + .map(lambda word: (word, 1)) \ + .reduceByKey(lambda a, b: a+b) + counts.pprint() + + ssc.start() + ssc.awaitTermination() diff --git a/examples/src/main/python/streaming/queue_stream.py b/examples/src/main/python/streaming/queue_stream.py index dcd6a0fc6ff9..b3808907f74a 100644 --- a/examples/src/main/python/streaming/queue_stream.py +++ b/examples/src/main/python/streaming/queue_stream.py @@ -36,8 +36,8 @@ # Create the queue through which RDDs can be pushed to # a QueueInputDStream rddQueue = [] - for i in xrange(5): - rddQueue += [ssc.sparkContext.parallelize([j for j in xrange(1, 1001)], 10)] + for i in range(5): + rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)] # Create the QueueInputDStream and use it 
do some processing inputStream = ssc.queueStream(rddQueue) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 78f31b4ffe56..340c3559b15e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -179,7 +179,7 @@ private class MyLogisticRegressionModel( * This is used for the default implementation of [[transform()]]. */ override def copy(extra: ParamMap): MyLogisticRegressionModel = { - copyValues(new MyLogisticRegressionModel(uid, weights), extra) + copyValues(new MyLogisticRegressionModel(uid, weights), extra).setParent(parent) } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala index 7682557127b5..8e3760ddb50a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala @@ -136,6 +136,7 @@ object LogisticRegressionExample { .setElasticNetParam(params.elasticNetParam) .setMaxIter(params.maxIter) .setTol(params.tol) + .setFitIntercept(params.fitIntercept) stages += lor val pipeline = new Pipeline().setStages(stages.toArray) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala index cd411397a4b9..3ae53e57dbdb 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala @@ -76,7 +76,7 @@ object MovieLensALS { .text("path to a MovieLens dataset of movies") .action((x, c) => c.copy(movies = x)) opt[Int]("rank") - .text(s"rank, default: ${defaultParams.rank}}") + .text(s"rank, default: ${defaultParams.rank}") .action((x, c) => c.copy(rank = x)) opt[Int]("maxIter") .text(s"max number of iterations, default: ${defaultParams.maxIter}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index 58d7b67674ff..f4d1fe57856a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -70,7 +70,7 @@ object SimpleParamsExample { // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. - paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. + paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params. // One can also combine ParamMaps. 
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala new file mode 100644 index 000000000000..1abdf219b1c0 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml + +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +/** + * A simple example demonstrating model selection using TrainValidationSplit. + * + * The example is based on [[SimpleParamsExample]] using linear regression. + * Run with + * {{{ + * bin/run-example ml.TrainValidationSplitExample + * }}} + */ +object TrainValidationSplitExample { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("TrainValidationSplitExample") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Prepare training and test data. + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) + + val lr = new LinearRegression() + + // We use a ParamGridBuilder to construct a grid of parameters to search over. + // TrainValidationSplit will try all combinations of values and determine best model using + // the evaluator. + val paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam, Array(0.1, 0.01)) + .addGrid(lr.fitIntercept, Array(true, false)) + .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) + .build() + + // In this case the estimator is simply the linear regression. + // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. + val trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator) + .setEstimatorParamMaps(paramGrid) + + // 80% of the data will be used for training and the remaining 20% for validation. + trainValidationSplit.setTrainRatio(0.8) + + // Run train validation split, and choose the best set of parameters. + val model = trainValidationSplit.fit(training) + + // Make predictions on test data. model is the model with combination of parameters + // that performed best. 
+ model.transform(test) + .select("features", "label", "prediction") + .show() + + sc.stop() + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala index 57ffe3dd2524..cc6bce3cb7c9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala @@ -100,7 +100,7 @@ object DecisionTreeRunner { .action((x, c) => c.copy(numTrees = x)) opt[String]("featureSubsetStrategy") .text(s"feature subset sampling strategy" + - s" (${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}}), " + + s" (${RandomForest.supportedFeatureSubsetStrategies.mkString(", ")}), " + s"default: ${defaultParams.featureSubsetStrategy}") .action((x, c) => c.copy(featureSubsetStrategy = x)) opt[Double]("fracTest") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala index e43a6f2864c7..69691ae297f6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala @@ -55,7 +55,7 @@ object MovieLensALS { val parser = new OptionParser[Params]("MovieLensALS") { head("MovieLensALS: an example app for ALS on MovieLens data.") opt[Int]("rank") - .text(s"rank, default: ${defaultParams.rank}}") + .text(s"rank, default: ${defaultParams.rank}") .action((x, c) => c.copy(rank = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 13189595d1d6..5eda12de8230 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -68,6 +68,11 @@ commons-codec provided + + commons-lang + commons-lang + provided + commons-net commons-net @@ -88,6 +93,12 @@ avro-ipc provided + + org.apache.avro + avro-mapred + ${avro.mapred.classifier} + provided + org.scala-lang scala-library @@ -104,7 +115,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar *:* diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 0664cfb2021e..33f2cd77677f 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 14f7daaf417e..670c783726c7 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 095bfb0c73a9..a65a9b921aaf 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -247,7 +247,7 @@ object FlumeUtils { * This is a helper class that wraps the methods in FlumeUtils into more Python-friendly class and * function so that it can be easily instantiated and called 
from Python's FlumeUtils. */ -private class FlumeUtilsPythonHelper { +private[flume] class FlumeUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 977514fa5a1e..6c163d2189bd 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -47,6 +47,90 @@ ${project.version} provided + + + commons-codec + commons-codec + provided + + + commons-lang + commons-lang + provided + + + com.google.protobuf + protobuf-java + provided + + + com.sun.jersey + jersey-server + provided + + + com.sun.jersey + jersey-core + provided + + + net.jpountz.lz4 + lz4 + provided + + + org.apache.hadoop + hadoop-client + provided + + + org.apache.avro + avro-mapred + ${avro.mapred.classifier} + provided + + + org.apache.curator + curator-recipes + provided + + + org.apache.zookeeper + zookeeper + provided + + + log4j + log4j + provided + + + net.java.dev.jets3t + jets3t + provided + + + org.scala-lang + scala-library + provided + + + org.slf4j + slf4j-api + provided + + + org.slf4j + slf4j-log4j12 + provided + + + org.xerial.snappy + snappy-java + provided + @@ -58,7 +142,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar *:* diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index ded863bd985e..bd2523b01efc 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala index 5a74febb4bd4..9159051ba06e 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala @@ -20,11 +20,9 @@ package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental /** - * :: Experimental :: - * Represent the host and port info for a Kafka broker. - * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID + * Represents the host and port info for a Kafka broker. + * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID. 
*/ -@Experimental final class Broker private( /** Broker's hostname */ val host: String, diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 48a1933d92f8..1000094e93cb 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -29,7 +29,8 @@ import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset -import org.apache.spark.streaming.scheduler.StreamInputInfo +import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} +import org.apache.spark.streaming.scheduler.rate.RateEstimator /** * A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where @@ -61,7 +62,7 @@ class DirectKafkaInputDStream[ val kafkaParams: Map[String, String], val fromOffsets: Map[TopicAndPartition, Long], messageHandler: MessageAndMetadata[K, V] => R -) extends InputDStream[R](ssc_) with Logging { + ) extends InputDStream[R](ssc_) with Logging { val maxRetries = context.sparkContext.getConf.getInt( "spark.streaming.kafka.maxRetries", 1) @@ -71,14 +72,40 @@ class DirectKafkaInputDStream[ protected[streaming] override val checkpointData = new DirectKafkaInputDStreamCheckpointData + + /** + * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker. + */ + override protected[streaming] val rateController: Option[RateController] = { + if (RateController.isBackPressureEnabled(ssc.conf)) { + Some(new DirectKafkaRateController(id, + RateEstimator.create(ssc.conf, ssc_.graph.batchDuration))) + } else { + None + } + } + protected val kc = new KafkaCluster(kafkaParams) - protected val maxMessagesPerPartition: Option[Long] = { - val ratePerSec = context.sparkContext.getConf.getInt( + private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( "spark.streaming.kafka.maxRatePerPartition", 0) - if (ratePerSec > 0) { + protected def maxMessagesPerPartition: Option[Long] = { + val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) + val numPartitions = currentOffsets.keys.size + + val effectiveRateLimitPerPartition = estimatedRateLimit + .filter(_ > 0) + .map { limit => + if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) + } else { + limit / numPartitions + } + }.getOrElse(maxRateLimitPerPartition) + + if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 - Some((secsPerBatch * ratePerSec).toLong) + Some((secsPerBatch * effectiveRateLimitPerPartition).toLong) } else { None } @@ -170,11 +197,18 @@ class DirectKafkaInputDStream[ val leaders = KafkaCluster.checkErrors(kc.findLeaders(topics)) batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) => - logInfo(s"Restoring KafkaRDD for time $t ${b.mkString("[", ", ", "]")}") - generatedRDDs += t -> new KafkaRDD[K, V, U, T, R]( - context.sparkContext, kafkaParams, b.map(OffsetRange(_)), leaders, messageHandler) + logInfo(s"Restoring KafkaRDD for time $t ${b.mkString("[", ", ", "]")}") + generatedRDDs += t -> new KafkaRDD[K, V, U, T, R]( + context.sparkContext, kafkaParams, b.map(OffsetRange(_)), leaders, 
messageHandler) } } } + /** + * A RateController to retrieve the rate from RateEstimator. + */ + private[streaming] class DirectKafkaRateController(id: Int, estimator: RateEstimator) + extends RateController(id, estimator) { + override def publish(rate: Long): Unit = () + } } diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 1a9d78c0d4f5..ea5f842c6caf 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -197,7 +197,11 @@ class KafkaRDD[ .dropWhile(_.offset < requestOffset) } - override def close(): Unit = consumer.close() + override def close(): Unit = { + if (consumer != null) { + consumer.close() + } + } override def getNext(): R = { if (iter == null || !iter.hasNext) { diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index b608b7595272..79a9db4291be 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -20,9 +20,8 @@ package org.apache.spark.streaming.kafka import java.io.File import java.lang.{Integer => JInt} import java.net.InetSocketAddress -import java.util.{Map => JMap} -import java.util.Properties import java.util.concurrent.TimeoutException +import java.util.{Map => JMap, Properties} import scala.annotation.tailrec import scala.language.postfixOps @@ -30,17 +29,16 @@ import scala.util.control.NonFatal import kafka.admin.AdminUtils import kafka.api.Request -import kafka.common.TopicAndPartition import kafka.producer.{KeyedMessage, Producer, ProducerConfig} import kafka.serializer.StringEncoder import kafka.server.{KafkaConfig, KafkaServer} import kafka.utils.{ZKStringSerializer, ZkUtils} -import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} import org.I0Itec.zkclient.ZkClient +import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} -import org.apache.spark.{Logging, SparkConf} import org.apache.spark.streaming.Time import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkConf} /** * This is a helper class for Kafka test suites. This has the functionality to set up @@ -48,7 +46,7 @@ import org.apache.spark.util.Utils * * The reason to put Kafka test utility class in src is to test Python related Kafka APIs. 
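// Illustrative sketch (not part of the patch): the per-partition rate computation that the
// DirectKafkaInputDStream hunk earlier in this diff introduces. The back-pressure estimate is
// split across partitions, capped by spark.streaming.kafka.maxRatePerPartition, and scaled by
// the batch interval. Standalone arithmetic only; the object and parameter names are local to
// this sketch and the sample values in the comment are made-up placeholders.
object RateLimitSketch {
  def maxMessagesPerPartition(
      estimatedRateLimit: Option[Int],   // latest rate from the RateController, if any
      maxRateLimitPerPartition: Int,     // spark.streaming.kafka.maxRatePerPartition (0 = unset)
      numPartitions: Int,                // assumed > 0
      batchDurationMs: Long): Option[Long] = {
    val effectiveRateLimitPerPartition = estimatedRateLimit
      .filter(_ > 0)
      .map { limit =>
        if (maxRateLimitPerPartition > 0) {
          math.min(maxRateLimitPerPartition, limit / numPartitions)
        } else {
          limit / numPartitions
        }
      }.getOrElse(maxRateLimitPerPartition)
    if (effectiveRateLimitPerPartition > 0) {
      val secsPerBatch = batchDurationMs.toDouble / 1000
      Some((secsPerBatch * effectiveRateLimitPerPartition).toLong)
    } else {
      None
    }
  }
  // Example: an estimate of 1000 msgs/sec over 4 partitions, capped at 200 per partition,
  // with a 500 ms batch gives min(200, 250) * 0.5 = Some(100) messages per partition per batch.
}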
*/ -private class KafkaTestUtils extends Logging { +private[kafka] class KafkaTestUtils extends Logging { // Zookeeper related configurations private val zkHost = "localhost" diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index f3b01bd60b17..388dbb818410 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -17,29 +17,25 @@ package org.apache.spark.streaming.kafka -import java.lang.{Integer => JInt} -import java.lang.{Long => JLong} -import java.util.{Map => JMap} -import java.util.{Set => JSet} -import java.util.{List => JList} +import java.lang.{Integer => JInt, Long => JLong} +import java.util.{List => JList, Map => JMap, Set => JSet} -import scala.reflect.ClassTag import scala.collection.JavaConversions._ +import scala.reflect.ClassTag import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata -import kafka.serializer.{DefaultDecoder, Decoder, StringDecoder} +import kafka.serializer.{Decoder, DefaultDecoder, StringDecoder} import org.apache.spark.api.java.function.{Function => JFunction} -import org.apache.spark.streaming.util.WriteAheadLogUtils -import org.apache.spark.{SparkContext, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.api.java.{JavaPairInputDStream, JavaInputDStream, JavaPairReceiverInputDStream, JavaStreamingContext} +import org.apache.spark.streaming.api.java.{JavaInputDStream, JavaPairInputDStream, JavaPairReceiverInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream} -import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} +import org.apache.spark.streaming.util.WriteAheadLogUtils +import org.apache.spark.{SparkContext, SparkException} object KafkaUtils { /** @@ -196,7 +192,6 @@ object KafkaUtils { * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition */ - @Experimental def createRDD[ K: ClassTag, V: ClassTag, @@ -214,7 +209,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. @@ -230,7 +224,6 @@ object KafkaUtils { * in which case leaders will be looked up on the driver. * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createRDD[ K: ClassTag, V: ClassTag, @@ -268,7 +261,6 @@ object KafkaUtils { * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition */ - @Experimental def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V]]( jsc: JavaSparkContext, keyClass: Class[K], @@ -287,7 +279,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. 
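// A minimal usage sketch of the createRDD API whose scaladoc appears above, now that its
// @Experimental tags are removed by this patch. The broker address, topic name and offsets are
// placeholders, and `sc` is assumed to be an existing SparkContext; this is not code from the patch.
import kafka.serializer.StringDecoder
import org.apache.spark.SparkContext
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

object CreateRDDSketch {
  def batchFromKafka(sc: SparkContext): Unit = {
    val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
    // One OffsetRange per topic/partition: offsets [0, 100) of partition 0 of "topic1".
    val offsetRanges = Array(OffsetRange("topic1", 0, 0L, 100L))
    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
      sc, kafkaParams, offsetRanges)
    println(s"Read ${rdd.count()} messages")
  }
}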
@@ -303,7 +294,6 @@ object KafkaUtils { * in which case leaders will be looked up on the driver. * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( jsc: JavaSparkContext, keyClass: Class[K], @@ -327,7 +317,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -357,7 +346,6 @@ object KafkaUtils { * starting point of the stream * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, @@ -375,7 +363,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -405,7 +392,6 @@ object KafkaUtils { * to determine where the stream starts (defaults to "largest") * @param topics Names of the topics to consume */ - @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, @@ -437,7 +423,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -472,7 +457,6 @@ object KafkaUtils { * starting point of the stream * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( jssc: JavaStreamingContext, keyClass: Class[K], @@ -499,7 +483,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -533,7 +516,6 @@ object KafkaUtils { * to determine where the stream starts (defaults to "largest") * @param topics Names of the topics to consume */ - @Experimental def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]]( jssc: JavaStreamingContext, keyClass: Class[K], @@ -564,7 +546,7 @@ object KafkaUtils { * classOf[KafkaUtilsPythonHelper].newInstance(), and the createStream() * takes care of known parameters instead of passing them from Python */ -private class KafkaUtilsPythonHelper { +private[kafka] class KafkaUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, kafkaParams: JMap[String, String], diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index f326e7f1f6f8..8a5f37149451 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -19,10 +19,7 @@ package org.apache.spark.streaming.kafka import kafka.common.TopicAndPartition -import org.apache.spark.annotation.Experimental - /** - * :: Experimental :: * Represents any object that has a collection of [[OffsetRange]]s. 
This can be used access the * offset ranges in RDDs generated by the direct Kafka DStream (see * [[KafkaUtils.createDirectStream()]]). @@ -33,25 +30,22 @@ import org.apache.spark.annotation.Experimental * } * }}} */ -@Experimental trait HasOffsetRanges { def offsetRanges: Array[OffsetRange] } /** - * :: Experimental :: * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class * can be created with `OffsetRange.create()`. + * @param topic Kafka topic name + * @param partition Kafka partition id + * @param fromOffset Inclusive starting offset + * @param untilOffset Exclusive ending offset */ -@Experimental final class OffsetRange private( - /** Kafka topic name */ val topic: String, - /** Kafka partition id */ val partition: Int, - /** inclusive starting offset */ val fromOffset: Long, - /** exclusive ending offset */ val untilOffset: Long) extends Serializable { import OffsetRange.OffsetRangeTuple @@ -84,10 +78,8 @@ final class OffsetRange private( } /** - * :: Experimental :: * Companion object the provides methods to create instances of [[OffsetRange]]. */ -@Experimental object OffsetRange { def create(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topic, partition, fromOffset, untilOffset) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala index 75f0dfc22b9d..764d170934aa 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala @@ -96,7 +96,7 @@ class ReliableKafkaReceiver[ blockOffsetMap = new ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]]() // Initialize the block generator for storing Kafka message. 
- blockGenerator = new BlockGenerator(new GeneratedBlockHandler, streamId, conf) + blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler) if (kafkaParams.contains(AUTO_OFFSET_COMMIT) && kafkaParams(AUTO_OFFSET_COMMIT) == "true") { logWarning(s"$AUTO_OFFSET_COMMIT should be set to false in ReliableKafkaReceiver, " + diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java index 02cd24a35906..9db07d0507fe 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java @@ -70,7 +70,7 @@ public void testKafkaStream() throws InterruptedException { final String topic1 = "topic1"; final String topic2 = "topic2"; // hold a reference to the current offset ranges, so it can be used downstream - final AtomicReference offsetRanges = new AtomicReference(); + final AtomicReference offsetRanges = new AtomicReference<>(); String[] topic1data = createTopicAndSendData(topic1); String[] topic2data = createTopicAndSendData(topic2); diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index 5b3c79444aa6..02225d5aa7cc 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -20,6 +20,9 @@ package org.apache.spark.streaming.kafka import java.io.File import java.util.concurrent.atomic.AtomicLong +import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset +import org.apache.spark.streaming.scheduler.rate.RateEstimator + import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ @@ -350,6 +353,77 @@ class DirectKafkaStreamSuite ssc.stop() } + test("using rate controller") { + val topic = "backpressure" + val topicPartition = TopicAndPartition(topic, 0) + kafkaTestUtils.createTopic(topic) + val kafkaParams = Map( + "metadata.broker.list" -> kafkaTestUtils.brokerAddress, + "auto.offset.reset" -> "smallest" + ) + + val batchIntervalMilliseconds = 100 + val estimator = new ConstantEstimator(100) + val messageKeys = (1 to 200).map(_.toString) + val messages = messageKeys.map((_, 1)).toMap + + val sparkConf = new SparkConf() + // Safe, even with streaming, because we're using the direct API. + // Using 1 core is useful to make the test more predictable. 
+ .setMaster("local[1]") + .setAppName(this.getClass.getSimpleName) + .set("spark.streaming.kafka.maxRatePerPartition", "100") + + // Setup the streaming context + ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds)) + + val kafkaStream = withClue("Error creating direct stream") { + val kc = new KafkaCluster(kafkaParams) + val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message) + val m = kc.getEarliestLeaderOffsets(Set(topicPartition)) + .fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset)) + + new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)]( + ssc, kafkaParams, m, messageHandler) { + override protected[streaming] val rateController = + Some(new DirectKafkaRateController(id, estimator)) + } + } + + val collectedData = + new mutable.ArrayBuffer[Array[String]]() with mutable.SynchronizedBuffer[Array[String]] + + // Used for assertion failure messages. + def dataToString: String = + collectedData.map(_.mkString("[", ",", "]")).mkString("{", ", ", "}") + + // This is to collect the raw data received from Kafka + kafkaStream.foreachRDD { (rdd: RDD[(String, String)], time: Time) => + val data = rdd.map { _._2 }.collect() + collectedData += data + } + + ssc.start() + + // Try different rate limits. + // Send data to Kafka and wait for arrays of data to appear matching the rate. + Seq(100, 50, 20).foreach { rate => + collectedData.clear() // Empty this buffer on each pass. + estimator.updateRate(rate) // Set a new rate. + // Expect blocks of data equal to "rate", scaled by the interval length in secs. + val expectedSize = Math.round(rate * batchIntervalMilliseconds * 0.001) + kafkaTestUtils.sendMessages(topic, messages) + eventually(timeout(5.seconds), interval(batchIntervalMilliseconds.milliseconds)) { + // Assert that rate estimator values are used to determine maxMessagesPerPartition. + // Funky "-" in message makes the complete assertion message read better. 
+ assert(collectedData.exists(_.size == expectedSize), + s" - No arrays of size $expectedSize for rate $rate found in $dataToString") + } + } + + ssc.stop() + } + /** Get the generated offset ranges from the DirectKafkaStream */ private def getOffsetRanges[K, V]( kafkaStream: DStream[(K, V)]): Seq[(Time, Array[OffsetRange])] = { @@ -381,3 +455,18 @@ object DirectKafkaStreamSuite { } } } + +private[streaming] class ConstantEstimator(@volatile private var rate: Long) + extends RateEstimator { + + def updateRate(newRate: Long): Unit = { + rate = newRate + } + + def compute( + time: Long, + elements: Long, + processingDelay: Long, + schedulingDelay: Long): Option[Double] = Some(rate) +} + diff --git a/external/mqtt-assembly/pom.xml b/external/mqtt-assembly/pom.xml new file mode 100644 index 000000000000..a5186151d4f6 --- /dev/null +++ b/external/mqtt-assembly/pom.xml @@ -0,0 +1,175 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.10 + 1.5.1-SNAPSHOT + ../../pom.xml + + + org.apache.spark + spark-streaming-mqtt-assembly_2.10 + jar + Spark Project External MQTT Assembly + http://spark.apache.org/ + + + streaming-mqtt-assembly + + + + + org.apache.spark + spark-streaming-mqtt_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + provided + + + + commons-lang + commons-lang + provided + + + com.google.protobuf + protobuf-java + provided + + + com.sun.jersey + jersey-server + provided + + + com.sun.jersey + jersey-core + provided + + + org.apache.hadoop + hadoop-client + provided + + + org.apache.avro + avro-mapred + ${avro.mapred.classifier} + provided + + + org.apache.curator + curator-recipes + provided + + + org.apache.zookeeper + zookeeper + provided + + + log4j + log4j + provided + + + net.java.dev.jets3t + jets3t + provided + + + org.scala-lang + scala-library + provided + + + org.slf4j + slf4j-api + provided + + + org.slf4j + slf4j-log4j12 + provided + + + org.xerial.snappy + snappy-java + provided + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + *:* + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + package + + shade + + + + + + reference.conf + + + log4j.properties + + + + + + + + + + + diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 0e41e5781784..6dbcb86e6924 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -78,5 +78,33 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + test-jar-with-dependencies + package + + single + + + + spark-streaming-mqtt-test-${project.version} + ${project.build.directory}/scala-${scala.binary.version}/ + false + + false + + src/main/assembly/assembly.xml + + + + + + diff --git a/external/mqtt/src/main/assembly/assembly.xml b/external/mqtt/src/main/assembly/assembly.xml new file mode 100644 index 000000000000..ecab5b360eb3 --- /dev/null +++ b/external/mqtt/src/main/assembly/assembly.xml @@ -0,0 +1,44 @@ + + + test-jar-with-dependencies + + jar + + false + + + + ${project.build.directory}/scala-${scala.binary.version}/test-classes + / + + + + + + true + test + true + + org.apache.hadoop:*:jar + org.apache.zookeeper:*:jar + org.apache.avro:*:jar + + + + + diff --git 
a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala index 1142d0f56ba3..7b8d56d6faf2 100644 --- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala +++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala @@ -21,8 +21,8 @@ import scala.reflect.ClassTag import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream} -import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream} +import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext} +import org.apache.spark.streaming.dstream.ReceiverInputDStream object MQTTUtils { /** @@ -74,3 +74,19 @@ object MQTTUtils { createStream(jssc.ssc, brokerUrl, topic, storageLevel) } } + +/** + * This is a helper class that wraps the methods in MQTTUtils into more Python-friendly class and + * function so that it can be easily instantiated and called from Python's MQTTUtils. + */ +private[mqtt] class MQTTUtilsPythonHelper { + + def createStream( + jssc: JavaStreamingContext, + brokerUrl: String, + topic: String, + storageLevel: StorageLevel + ): JavaDStream[String] = { + MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel) + } +} diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index c4bf5aa7869b..a6a9249db8ed 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -17,46 +17,30 @@ package org.apache.spark.streaming.mqtt -import java.net.{URI, ServerSocket} -import java.util.concurrent.CountDownLatch -import java.util.concurrent.TimeUnit - import scala.concurrent.duration._ import scala.language.postfixOps -import org.apache.activemq.broker.{TransportConnector, BrokerService} -import org.apache.commons.lang3.RandomUtils -import org.eclipse.paho.client.mqttv3._ -import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence - import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually -import org.apache.spark.streaming.{Milliseconds, StreamingContext} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.apache.spark.streaming.scheduler.StreamingListener -import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.util.Utils +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.{Milliseconds, StreamingContext} class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter { private val batchDuration = Milliseconds(500) private val master = "local[2]" private val framework = this.getClass.getSimpleName - private val freePort = findFreePort() - private val brokerUri = "//localhost:" + freePort private val topic = "def" - private val persistenceDir = Utils.createTempDir() private var ssc: StreamingContext = _ - private var broker: BrokerService = _ - private var connector: TransportConnector = _ + private var mqttTestUtils: MQTTTestUtils = _ before { ssc = new StreamingContext(master, framework, batchDuration) - 
setupMQTT() + mqttTestUtils = new MQTTTestUtils + mqttTestUtils.setup() } after { @@ -64,14 +48,17 @@ class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter ssc.stop() ssc = null } - Utils.deleteRecursively(persistenceDir) - tearDownMQTT() + if (mqttTestUtils != null) { + mqttTestUtils.teardown() + mqttTestUtils = null + } } test("mqtt input stream") { val sendMessage = "MQTT demo for spark streaming" - val receiveStream = - MQTTUtils.createStream(ssc, "tcp:" + brokerUri, topic, StorageLevel.MEMORY_ONLY) + val receiveStream = MQTTUtils.createStream(ssc, "tcp://" + mqttTestUtils.brokerUri, topic, + StorageLevel.MEMORY_ONLY) + @volatile var receiveMessage: List[String] = List() receiveStream.foreachRDD { rdd => if (rdd.collect.length > 0) { @@ -79,89 +66,14 @@ class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter receiveMessage } } - ssc.start() - // wait for the receiver to start before publishing data, or we risk failing - // the test nondeterministically. See SPARK-4631 - waitForReceiverToStart() + ssc.start() - publishData(sendMessage) + // Retry it because we don't know when the receiver will start. eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { + mqttTestUtils.publishData(topic, sendMessage) assert(sendMessage.equals(receiveMessage(0))) } ssc.stop() } - - private def setupMQTT() { - broker = new BrokerService() - broker.setDataDirectoryFile(Utils.createTempDir()) - connector = new TransportConnector() - connector.setName("mqtt") - connector.setUri(new URI("mqtt:" + brokerUri)) - broker.addConnector(connector) - broker.start() - } - - private def tearDownMQTT() { - if (broker != null) { - broker.stop() - broker = null - } - if (connector != null) { - connector.stop() - connector = null - } - } - - private def findFreePort(): Int = { - val candidatePort = RandomUtils.nextInt(1024, 65536) - Utils.startServiceOnPort(candidatePort, (trialPort: Int) => { - val socket = new ServerSocket(trialPort) - socket.close() - (null, trialPort) - }, new SparkConf())._2 - } - - def publishData(data: String): Unit = { - var client: MqttClient = null - try { - val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) - client = new MqttClient("tcp:" + brokerUri, MqttClient.generateClientId(), persistence) - client.connect() - if (client.isConnected) { - val msgTopic = client.getTopic(topic) - val message = new MqttMessage(data.getBytes("utf-8")) - message.setQos(1) - message.setRetained(true) - - for (i <- 0 to 10) { - try { - msgTopic.publish(message) - } catch { - case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT => - // wait for Spark streaming to consume something from the message queue - Thread.sleep(50) - } - } - } - } finally { - client.disconnect() - client.close() - client = null - } - } - - /** - * Block until at least one receiver has started or timeout occurs. 
- */ - private def waitForReceiverToStart() = { - val latch = new CountDownLatch(1) - ssc.addStreamingListener(new StreamingListener { - override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { - latch.countDown() - } - }) - - assert(latch.await(10, TimeUnit.SECONDS), "Timeout waiting for receiver to start.") - } } diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala new file mode 100644 index 000000000000..1618e2c088b7 --- /dev/null +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.mqtt + +import java.net.{ServerSocket, URI} + +import scala.language.postfixOps + +import com.google.common.base.Charsets.UTF_8 +import org.apache.activemq.broker.{BrokerService, TransportConnector} +import org.apache.commons.lang3.RandomUtils +import org.eclipse.paho.client.mqttv3._ +import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence + +import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkConf} + +/** + * Share codes for Scala and Python unit tests + */ +private[mqtt] class MQTTTestUtils extends Logging { + + private val persistenceDir = Utils.createTempDir() + private val brokerHost = "localhost" + private val brokerPort = findFreePort() + + private var broker: BrokerService = _ + private var connector: TransportConnector = _ + + def brokerUri: String = { + s"$brokerHost:$brokerPort" + } + + def setup(): Unit = { + broker = new BrokerService() + broker.setDataDirectoryFile(Utils.createTempDir()) + connector = new TransportConnector() + connector.setName("mqtt") + connector.setUri(new URI("mqtt://" + brokerUri)) + broker.addConnector(connector) + broker.start() + } + + def teardown(): Unit = { + if (broker != null) { + broker.stop() + broker = null + } + if (connector != null) { + connector.stop() + connector = null + } + Utils.deleteRecursively(persistenceDir) + } + + private def findFreePort(): Int = { + val candidatePort = RandomUtils.nextInt(1024, 65536) + Utils.startServiceOnPort(candidatePort, (trialPort: Int) => { + val socket = new ServerSocket(trialPort) + socket.close() + (null, trialPort) + }, new SparkConf())._2 + } + + def publishData(topic: String, data: String): Unit = { + var client: MqttClient = null + try { + val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) + client = new MqttClient("tcp://" + brokerUri, MqttClient.generateClientId(), persistence) + client.connect() + if (client.isConnected) { + val msgTopic = client.getTopic(topic) + val message = new 
MqttMessage(data.getBytes(UTF_8)) + message.setQos(1) + message.setRetained(true) + + for (i <- 0 to 10) { + try { + msgTopic.publish(message) + } catch { + case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT => + // wait for Spark streaming to consume something from the message queue + Thread.sleep(50) + } + } + } + } finally { + if (client != null) { + client.disconnect() + client.close() + client = null + } + } + } + +} diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 178ae8de13b5..507e7d3293a2 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 37bfd10d4366..7d9b64208cdf 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index 3636a9037d43..52f948ee4d65 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl-assembly/pom.xml b/extras/kinesis-asl-assembly/pom.xml index 70d2c9c58f54..b767a24d53ca 100644 --- a/extras/kinesis-asl-assembly/pom.xml +++ b/extras/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -47,6 +47,85 @@ ${project.version} provided + + + com.fasterxml.jackson.core + jackson-databind + provided + + + commons-lang + commons-lang + provided + + + com.google.protobuf + protobuf-java + provided + + + com.sun.jersey + jersey-server + provided + + + com.sun.jersey + jersey-core + provided + + + log4j + log4j + provided + + + net.java.dev.jets3t + jets3t + provided + + + org.apache.hadoop + hadoop-client + provided + + + org.apache.avro + avro-ipc + provided + + + org.apache.avro + avro-mapred + ${avro.mapred.classifier} + provided + + + org.apache.curator + curator-recipes + provided + + + org.apache.zookeeper + zookeeper + provided + + + org.slf4j + slf4j-api + provided + + + org.slf4j + slf4j-log4j12 + provided + + + org.xerial.snappy + snappy-java + provided + @@ -58,7 +137,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kinesis-asl-assembly-${project.version}.jar *:* diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c242e7a57b9a..e95a4ae84c91 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -31,7 +31,7 @@ Spark Kinesis Integration - kinesis-asl + streaming-kinesis-asl diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 8f144a4d974a..a003ddf325e6 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -37,16 +37,18 @@ case class SequenceNumberRange( /** Class representing an array of Kinesis sequence number ranges */ private[kinesis] -case class SequenceNumberRanges(ranges: Array[SequenceNumberRange]) { 
+case class SequenceNumberRanges(ranges: Seq[SequenceNumberRange]) { def isEmpty(): Boolean = ranges.isEmpty + def nonEmpty(): Boolean = ranges.nonEmpty + override def toString(): String = ranges.mkString("SequenceNumberRanges(", ", ", ")") } private[kinesis] object SequenceNumberRanges { def apply(range: SequenceNumberRange): SequenceNumberRanges = { - new SequenceNumberRanges(Array(range)) + new SequenceNumberRanges(Seq(range)) } } @@ -66,14 +68,14 @@ class KinesisBackedBlockRDDPartition( */ private[kinesis] class KinesisBackedBlockRDD( - sc: SparkContext, - regionId: String, - endpointUrl: String, + @transient sc: SparkContext, + val regionName: String, + val endpointUrl: String, @transient blockIds: Array[BlockId], - @transient arrayOfseqNumberRanges: Array[SequenceNumberRanges], + @transient val arrayOfseqNumberRanges: Array[SequenceNumberRanges], @transient isBlockIdValid: Array[Boolean] = Array.empty, - retryTimeoutMs: Int = 10000, - awsCredentialsOption: Option[SerializableAWSCredentials] = None + val retryTimeoutMs: Int = 10000, + val awsCredentialsOption: Option[SerializableAWSCredentials] = None ) extends BlockRDD[Array[Byte]](sc, blockIds) { require(blockIds.length == arrayOfseqNumberRanges.length, @@ -104,7 +106,7 @@ class KinesisBackedBlockRDD( } partition.seqNumberRanges.ranges.iterator.flatMap { range => new KinesisSequenceRangeIterator( - credenentials, endpointUrl, regionId, range, retryTimeoutMs) + credenentials, endpointUrl, regionName, range, retryTimeoutMs) } } if (partition.isBlockIdValid) { diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala new file mode 100644 index 000000000000..2e4204dcb6f1 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kinesis + +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream + +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.{BlockId, StorageLevel} +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.receiver.Receiver +import org.apache.spark.streaming.scheduler.ReceivedBlockInfo +import org.apache.spark.streaming.{Duration, StreamingContext, Time} + +private[kinesis] class KinesisInputDStream( + @transient _ssc: StreamingContext, + streamName: String, + endpointUrl: String, + regionName: String, + initialPositionInStream: InitialPositionInStream, + checkpointAppName: String, + checkpointInterval: Duration, + storageLevel: StorageLevel, + awsCredentialsOption: Option[SerializableAWSCredentials] + ) extends ReceiverInputDStream[Array[Byte]](_ssc) { + + private[streaming] + override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = { + + // This returns true even for when blockInfos is empty + val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) + + if (allBlocksHaveRanges) { + // Create a KinesisBackedBlockRDD, even when there are no blocks + val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray + val seqNumRanges = blockInfos.map { + _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray + val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray + logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") + new KinesisBackedBlockRDD( + context.sc, regionName, endpointUrl, blockIds, seqNumRanges, + isBlockIdValid = isBlockIdValid, + retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, + awsCredentialsOption = awsCredentialsOption) + } else { + logWarning("Kinesis sequence number information was not present with some block metadata," + + " it may not be possible to recover from failures") + super.createBlockRDD(time, blockInfos) + } + } + + override def getReceiver(): Receiver[Array[Byte]] = { + new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, + checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption) + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 1a8a4cecc114..22324e821ce9 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -18,17 +18,20 @@ package org.apache.spark.streaming.kinesis import java.util.UUID +import scala.collection.JavaConversions.asScalaIterator +import scala.collection.mutable import scala.util.control.NonFatal -import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, BasicAWSCredentials, DefaultAWSCredentialsProviderChain} +import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, DefaultAWSCredentialsProviderChain} import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorFactory} import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{InitialPositionInStream, KinesisClientLibConfiguration, Worker} +import com.amazonaws.services.kinesis.model.Record -import org.apache.spark.Logging -import org.apache.spark.storage.StorageLevel 
+import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming.Duration -import org.apache.spark.streaming.receiver.Receiver +import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver} import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkEnv} private[kinesis] @@ -42,38 +45,47 @@ case class SerializableAWSCredentials(accessKeyId: String, secretKey: String) * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver. * This implementation relies on the Kinesis Client Library (KCL) Worker as described here: * https://github.com/awslabs/amazon-kinesis-client - * This is a custom receiver used with StreamingContext.receiverStream(Receiver) as described here: - * http://spark.apache.org/docs/latest/streaming-custom-receivers.html - * Instances of this class will get shipped to the Spark Streaming Workers to run within a - * Spark Executor. * - * @param appName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams - * by the Kinesis Client Library. If you change the App name or Stream name, - * the KCL will throw errors. This usually requires deleting the backing - * DynamoDB table with the same name this Kinesis application. + * The way this Receiver works is as follows: + * - The receiver starts a KCL Worker, which is essentially runs a threadpool of multiple + * KinesisRecordProcessor + * - Each KinesisRecordProcessor receives data from a Kinesis shard in batches. Each batch is + * inserted into a Block Generator, and the corresponding range of sequence numbers is recorded. + * - When the block generator defines a block, then the recorded sequence number ranges that were + * inserted into the block are recorded separately for being used later. + * - When the block is ready to be pushed, the block is pushed and the ranges are reported as + * metadata of the block. In addition, the ranges are used to find out the latest sequence + * number for each shard that can be checkpointed through the DynamoDB. + * - Periodically, each KinesisRecordProcessor checkpoints the latest successfully stored sequence + * number for it own shard. + * * @param streamName Kinesis stream name * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) * @param regionName Region name used by the Kinesis Client Library for * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the * worker's initial starting position in the stream. * The values are either the beginning of the stream * per Kinesis' limit of 24 hours * (InitialPositionInStream.TRIM_HORIZON) or * the tip of the stream (InitialPositionInStream.LATEST). + * @param checkpointAppName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams + * by the Kinesis Client Library. If you change the App name or Stream name, + * the KCL will throw errors. This usually requires deleting the backing + * DynamoDB table with the same name this Kinesis application. + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. 
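// A hedged sketch of how the parameters documented in the KinesisReceiver scaladoc above
// (checkpoint app name, stream name, endpoint, region, initial position, checkpoint interval,
// storage level) are typically supplied by an application through KinesisUtils.createStream,
// which is defined elsewhere in Spark and not in this hunk. All string values below are
// placeholders, not taken from this patch; `ssc` is assumed to be an existing StreamingContext.
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.KinesisUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KinesisStreamSketch {
  def printKinesisRecords(ssc: StreamingContext): Unit = {
    val byteStream = KinesisUtils.createStream(
      ssc,
      "myKinesisApp",                            // checkpointAppName (backing DynamoDB table)
      "myStream",                                // Kinesis stream name
      "https://kinesis.us-east-1.amazonaws.com", // endpoint URL
      "us-east-1",                               // region name
      InitialPositionInStream.LATEST,
      Seconds(10),                               // checkpoint interval
      StorageLevel.MEMORY_AND_DISK_2)
    // Records arrive as Array[Byte]; decode and print a sample of each batch.
    byteStream.map(bytes => new String(bytes, "UTF-8")).print()
  }
}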
* @param storageLevel Storage level to use for storing the received objects * @param awsCredentialsOption Optional AWS credentials, used when user directly specifies * the credentials */ private[kinesis] class KinesisReceiver( - appName: String, - streamName: String, + val streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, + checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, awsCredentialsOption: Option[SerializableAWSCredentials] @@ -90,7 +102,7 @@ private[kinesis] class KinesisReceiver( * workerId is used by the KCL should be based on the ip address of the actual Spark Worker * where this code runs (not the driver's IP address.) */ - private var workerId: String = null + @volatile private var workerId: String = null /** * Worker is the core client abstraction from the Kinesis Client Library (KCL). @@ -98,22 +110,40 @@ private[kinesis] class KinesisReceiver( * Each shard is assigned its own IRecordProcessor and the worker run multiple such * processors. */ - private var worker: Worker = null + @volatile private var worker: Worker = null + @volatile private var workerThread: Thread = null - /** Thread running the worker */ - private var workerThread: Thread = null + /** BlockGenerator used to generates blocks out of Kinesis data */ + @volatile private var blockGenerator: BlockGenerator = null + /** + * Sequence number ranges added to the current block being generated. + * Accessing and updating of this map is synchronized by locks in BlockGenerator. + */ + private val seqNumRangesInCurrentBlock = new mutable.ArrayBuffer[SequenceNumberRange] + + /** Sequence number ranges of data added to each generated block */ + private val blockIdToSeqNumRanges = new mutable.HashMap[StreamBlockId, SequenceNumberRanges] + with mutable.SynchronizedMap[StreamBlockId, SequenceNumberRanges] + + /** + * Latest sequence number ranges that have been stored successfully. + * This is used for checkpointing through KCL */ + private val shardIdToLatestStoredSeqNum = new mutable.HashMap[String, String] + with mutable.SynchronizedMap[String, String] /** * This is called when the KinesisReceiver starts and must be non-blocking. * The KCL creates and manages the receiving/processing thread pool through Worker.run(). 
*/ override def onStart() { + blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler) + workerId = Utils.localHostName() + ":" + UUID.randomUUID() // KCL config instance val awsCredProvider = resolveAWSCredentialsProvider() val kinesisClientLibConfiguration = - new KinesisClientLibConfiguration(appName, streamName, awsCredProvider, workerId) + new KinesisClientLibConfiguration(checkpointAppName, streamName, awsCredProvider, workerId) .withKinesisEndpoint(endpointUrl) .withInitialPositionInStream(initialPositionInStream) .withTaskBackoffTimeMillis(500) @@ -141,6 +171,10 @@ private[kinesis] class KinesisReceiver( } } } + + blockIdToSeqNumRanges.clear() + blockGenerator.start() + workerThread.setName(s"Kinesis Receiver ${streamId}") workerThread.setDaemon(true) workerThread.start() @@ -165,6 +199,81 @@ private[kinesis] class KinesisReceiver( workerId = null } + /** Add records of the given shard to the current block being generated */ + private[kinesis] def addRecords(shardId: String, records: java.util.List[Record]): Unit = { + if (records.size > 0) { + val dataIterator = records.iterator().map { record => + val byteBuffer = record.getData() + val byteArray = new Array[Byte](byteBuffer.remaining()) + byteBuffer.get(byteArray) + byteArray + } + val metadata = SequenceNumberRange(streamName, shardId, + records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber()) + blockGenerator.addMultipleDataWithCallback(dataIterator, metadata) + + } + } + + /** Get the latest sequence number for the given shard that can be checkpointed through KCL */ + private[kinesis] def getLatestSeqNumToCheckpoint(shardId: String): Option[String] = { + shardIdToLatestStoredSeqNum.get(shardId) + } + + /** + * Remember the range of sequence numbers that was added to the currently active block. + * Internally, this is synchronized with `finalizeRangesForCurrentBlock()`. + */ + private def rememberAddedRange(range: SequenceNumberRange): Unit = { + seqNumRangesInCurrentBlock += range + } + + /** + * Finalize the ranges added to the block that was active and prepare the ranges buffer + * for next block. Internally, this is synchronized with `rememberAddedRange()`. 
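The addRecords method above copies each record's payload out of its ByteBuffer with remaining()/get() rather than calling array(), which would expose the whole (possibly offset or shared) backing array. A small standalone illustration of that difference, using only java.nio and a made-up payload:

import java.nio.ByteBuffer

object ByteBufferCopyExample {
  def main(args: Array[String]): Unit = {
    // A buffer whose readable region is only part of its backing array,
    // similar to what a Kinesis Record's getData() may return.
    val backing = "padding|payload".getBytes("UTF-8")
    val buffer = ByteBuffer.wrap(backing, 8, 7) // window over "payload"

    // Copy exactly the remaining bytes, as the receiver does for each record.
    val copy = new Array[Byte](buffer.remaining())
    buffer.get(copy)

    println(new String(copy, "UTF-8"))           // payload
    println(new String(buffer.array(), "UTF-8")) // padding|payload (whole backing array)
  }
}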
+ */ + private def finalizeRangesForCurrentBlock(blockId: StreamBlockId): Unit = { + blockIdToSeqNumRanges(blockId) = SequenceNumberRanges(seqNumRangesInCurrentBlock.toArray) + seqNumRangesInCurrentBlock.clear() + logDebug(s"Generated block $blockId has $blockIdToSeqNumRanges") + } + + /** Store the block along with its associated ranges */ + private def storeBlockWithRanges( + blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[Array[Byte]]): Unit = { + val rangesToReportOption = blockIdToSeqNumRanges.remove(blockId) + if (rangesToReportOption.isEmpty) { + stop("Error while storing block into Spark, could not find sequence number ranges " + + s"for block $blockId") + return + } + + val rangesToReport = rangesToReportOption.get + var attempt = 0 + var stored = false + var throwable: Throwable = null + while (!stored && attempt <= 3) { + try { + store(arrayBuffer, rangesToReport) + stored = true + } catch { + case NonFatal(th) => + attempt += 1 + throwable = th + } + } + if (!stored) { + stop("Error while storing block into Spark", throwable) + } + + // Update the latest sequence number that has been successfully stored for each shard + // Note that we are doing this sequentially because the array of sequence number ranges + // is assumed to be in the order in which the data was added, so the last range seen for + // a shard carries its latest sequence number + rangesToReport.ranges.foreach { range => + shardIdToLatestStoredSeqNum(range.shardId) = range.toSeqNumber + } + } + /** * If AWS credential is provided, return a AWSCredentialProvider returning that credential. * Otherwise, return the DefaultAWSCredentialsProviderChain. @@ -182,4 +291,46 @@ private[kinesis] class KinesisReceiver( new DefaultAWSCredentialsProviderChain() } } + + + /** + * Class to handle blocks generated by this receiver's block generator. Specifically, in + * the context of the Kinesis Receiver, this handler does the following. + * + * - When an array of records is added to the current active block in the block generator, + * this handler keeps track of the corresponding sequence number range. + * - When the currently active block is ready to be sealed (no more records), this handler + * keeps track of the list of ranges added into this block in a separate map, keyed by the + * block's id. + */ + private class GeneratedBlockHandler extends BlockGeneratorListener { + + /** + * Callback method called after a data item is added into the BlockGenerator. + * The data addition, block generation, and calls to onAddData and onGenerateBlock + * are all synchronized through the same lock. + */ + def onAddData(data: Any, metadata: Any): Unit = { + rememberAddedRange(metadata.asInstanceOf[SequenceNumberRange]) + } + + /** + * Callback method called after a block has been generated. + * The data addition, block generation, and calls to onAddData and onGenerateBlock + * are all synchronized through the same lock. + */ + def onGenerateBlock(blockId: StreamBlockId): Unit = { + finalizeRangesForCurrentBlock(blockId) + } + + /** Callback method called when a block is ready to be pushed / stored.
 */ + def onPushBlock(blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = { + storeBlockWithRanges(blockId, + arrayBuffer.asInstanceOf[mutable.ArrayBuffer[Array[Byte]]]) + } + + /** Callback called in case of any error in the internals of the BlockGenerator */ + def onError(message: String, throwable: Throwable): Unit = { + reportError(message, throwable) + } + } } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index fe9e3a0c793e..b2405123321e 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -18,20 +18,16 @@ package org.apache.spark.streaming.kinesis import java.util.List -import scala.collection.JavaConversions.asScalaBuffer import scala.util.Random +import scala.util.control.NonFatal -import org.apache.spark.Logging - -import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException -import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException -import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor -import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException} +import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer} import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason import com.amazonaws.services.kinesis.model.Record +import org.apache.spark.Logging + /** * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. * This implementation operates on the Array[Byte] from the KinesisReceiver. @@ -51,6 +47,7 @@ private[kinesis] class KinesisRecordProcessor( checkpointState: KinesisCheckpointState) extends IRecordProcessor with Logging { // shardId to be populated during initialize() + @volatile private var shardId: String = _ /** @@ -75,47 +72,38 @@ private[kinesis] class KinesisRecordProcessor( override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) { if (!receiver.isStopped()) { try { - /* - * Notes: - * 1) If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming - * Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the - * internally-configured Spark serializer (kryo, etc). - * 2) This is not desirable, so we instead store a raw Array[Byte] and decouple - * ourselves from Spark's internal serialization strategy. - * 3) For performance, the BlockGenerator is asynchronously queuing elements within its - * memory before creating blocks. This prevents the small block scenario, but requires - * that you register callbacks to know when a block has been generated and stored - * (WAL is sufficient for storage) before can checkpoint back to the source.
- */ - batch.foreach(record => receiver.store(record.getData().array())) - - logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") + receiver.addRecords(shardId, batch) + logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") /* - * Checkpoint the sequence number of the last record successfully processed/stored - * in the batch. - * In this implementation, we're checkpointing after the given checkpointIntervalMillis. - * Note that this logic requires that processRecords() be called AND that it's time to - * checkpoint. I point this out because there is no background thread running the - * checkpointer. Checkpointing is tested and trigger only when a new batch comes in. - * If the worker is shutdown cleanly, checkpoint will happen (see shutdown() below). - * However, if the worker dies unexpectedly, a checkpoint may not happen. - * This could lead to records being processed more than once. + * + * Checkpoint the sequence number of the last record successfully stored. + * Note that in the current implementation, checkpointing occurs only after + * checkpointIntervalMillis has elapsed since the last checkpoint, AND when there is a new + * record to process. This causes the checkpoint to lag behind the records that have + * already been stored by the receiver. Of course, this can lead to records being + * processed more than once, under failures and restarts. + * + * TODO: Instead of checkpointing here, run a separate timer task to perform + * checkpointing so that it checkpoints in a timely manner independent of whether + * new records are available or not. */ if (checkpointState.shouldCheckpoint()) { - /* Perform the checkpoint */ - KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100) + receiver.getLatestSeqNumToCheckpoint(shardId).foreach { latestSeqNum => + /* Perform the checkpoint */ + KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(latestSeqNum), 4, 100) - /* Update the next checkpoint time */ - checkpointState.advanceCheckpoint() + /* Update the next checkpoint time */ + checkpointState.advanceCheckpoint() - logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint of ${batch.size}" + + logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint of ${batch.size}" + s" records for shardId $shardId") - logDebug(s"Checkpoint: Next checkpoint is at " + + logDebug(s"Checkpoint: Next checkpoint is at " + s" ${checkpointState.checkpointClock.getTimeMillis()} for shardId $shardId") + } } } catch { - case e: Throwable => { + case NonFatal(e) => { /* * If there is a failure within the batch, the batch will not be checkpointed. * This will potentially cause records since the last checkpoint to be processed @@ -130,7 +118,7 @@ private[kinesis] class KinesisRecordProcessor( } } else { /* RecordProcessor has been stopped. */ - logInfo(s"Stopped: The Spark KinesisReceiver has stopped for workerId $workerId" + + logInfo(s"Stopped: KinesisReceiver has stopped for workerId $workerId" + s" and shardId $shardId. No more records will be processed.") } } @@ -154,7 +142,11 @@ private[kinesis] class KinesisRecordProcessor( * It's now OK to read from the new shards that resulted from a resharding event.
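The comment above describes checkpointing that is gated on both a new batch arriving and checkpointIntervalMillis having elapsed. Below is a minimal sketch of such an interval gate, as a simplified stand-in for the real KinesisCheckpointState (which uses Spark's Clock abstraction; System.currentTimeMillis is assumed here for self-containment).

// Simplified interval gate: shouldCheckpoint() only flips to true after the
// configured interval, and only when the caller actually invokes it, so the
// checkpoint can lag behind data that has already been stored.
class IntervalGate(checkpointIntervalMillis: Long,
                   now: () => Long = () => System.currentTimeMillis()) {
  private var nextCheckpointTime = now() + checkpointIntervalMillis

  def shouldCheckpoint(): Boolean = now() >= nextCheckpointTime

  def advanceCheckpoint(): Unit = {
    nextCheckpointTime = now() + checkpointIntervalMillis
  }
}

// Usage mirroring the processRecords flow above (hypothetical calls):
// if (gate.shouldCheckpoint()) { checkpoint(latestSeqNum); gate.advanceCheckpoint() }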
*/ case ShutdownReason.TERMINATE => - KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100) + val latestSeqNumToCheckpointOption = receiver.getLatestSeqNumToCheckpoint(shardId) + if (latestSeqNumToCheckpointOption.nonEmpty) { + KinesisRecordProcessor.retryRandom( + checkpointer.checkpoint(latestSeqNumToCheckpointOption.get), 4, 100) + } /* * ZOMBIE Use Case. NoOp. diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala index 255ac27f793b..c8eec13ec7dc 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -36,22 +36,10 @@ import org.apache.spark.Logging /** * Shared utility methods for performing Kinesis tests that actually transfer data */ -private class KinesisTestUtils(val endpointUrl: String, _regionName: String) extends Logging { - - def this() { - this("https://kinesis.us-west-2.amazonaws.com", "") - } - - def this(endpointUrl: String) { - this(endpointUrl, "") - } - - val regionName = if (_regionName.length == 0) { - RegionUtils.getRegionByEndpoint(endpointUrl).getName() - } else { - RegionUtils.getRegion(_regionName).getName() - } +private[kinesis] class KinesisTestUtils extends Logging { + val endpointUrl = KinesisTestUtils.endpointUrl + val regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName() val streamShardCount = 2 private val createStreamTimeoutSeconds = 300 @@ -81,11 +69,11 @@ private class KinesisTestUtils(val endpointUrl: String, _regionName: String) ext } def createStream(): Unit = { - logInfo("Creating stream") require(!streamCreated, "Stream already created") _streamName = findNonExistentStreamName() // Create a stream. The number of shards determines the provisioned throughput. + logInfo(s"Creating stream ${_streamName}") val createStreamRequest = new CreateStreamRequest() createStreamRequest.setStreamName(_streamName) createStreamRequest.setShardCount(2) @@ -94,7 +82,7 @@ private class KinesisTestUtils(val endpointUrl: String, _regionName: String) ext // The stream is now being created. Wait for it to become active. waitForStreamToBeActive(_streamName) streamCreated = true - logInfo("Created stream") + logInfo(s"Created stream ${_streamName}") } /** @@ -191,9 +179,38 @@ private class KinesisTestUtils(val endpointUrl: String, _regionName: String) ext private[kinesis] object KinesisTestUtils { - val envVarName = "ENABLE_KINESIS_TESTS" + val envVarNameForEnablingTests = "ENABLE_KINESIS_TESTS" + val endVarNameForEndpoint = "KINESIS_TEST_ENDPOINT_URL" + val defaultEndpointUrl = "https://kinesis.us-west-2.amazonaws.com" + + lazy val shouldRunTests = { + val isEnvSet = sys.env.get(envVarNameForEnablingTests) == Some("1") + if (isEnvSet) { + // scalastyle:off println + // Print this so that they are easily visible on the console and not hidden in the log4j logs. + println( + s""" + |Kinesis tests that actually send data has been enabled by setting the environment + |variable $envVarNameForEnablingTests to 1. This will create Kinesis Streams and + |DynamoDB tables in AWS. Please be aware that this may incur some AWS costs. + |By default, the tests use the endpoint URL $defaultEndpointUrl to create Kinesis streams. 
+ |To change this endpoint URL to a different region, you can set the environment variable + |$endVarNameForEndpoint to the desired endpoint URL + |(e.g. $endVarNameForEndpoint="https://kinesis.us-west-2.amazonaws.com"). + """.stripMargin) + // scalastyle:on println + } + isEnvSet + } - val shouldRunTests = sys.env.get(envVarName) == Some("1") + lazy val endpointUrl = { + val url = sys.env.getOrElse(endVarNameForEndpoint, defaultEndpointUrl) + // scalastyle:off println + // Print this so that they are easily visible on the console and not hidden in the log4j logs. + println(s"Using endpoint URL $url for creating Kinesis streams for tests.") + // scalastyle:on println + url + } def isAWSCredentialsPresent: Boolean = { Try { new DefaultAWSCredentialsProviderChain().getCredentials() }.isSuccess @@ -205,7 +222,13 @@ private[kinesis] object KinesisTestUtils { Try { new DefaultAWSCredentialsProviderChain().getCredentials() } match { case Success(cred) => cred case Failure(e) => - throw new Exception("Kinesis tests enabled, but could get not AWS credentials") + throw new Exception( + s""" + |Kinesis tests enabled using environment variable $envVarNameForEnablingTests + |but could not find AWS credentials. Please follow instructions in AWS documentation + |to set the credentials in your system such that the DefaultAWSCredentialsProviderChain + |can find the credentials. + """.stripMargin) } } } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index 7dab17eba848..c799fadf2d5c 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -65,9 +65,8 @@ object KinesisUtils { ): ReceiverInputDStream[Array[Byte]] = { // Setting scope to override receiver stream's scope of "receiver stream" ssc.withNamedScope("kinesis stream") { - ssc.receiverStream( - new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, checkpointInterval, storageLevel, None)) + new KinesisInputDStream(ssc, streamName, endpointUrl, validateRegion(regionName), + initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, None) } } @@ -112,10 +111,11 @@ object KinesisUtils { awsAccessKeyId: String, awsSecretKey: String ): ReceiverInputDStream[Array[Byte]] = { - ssc.receiverStream( - new KinesisReceiver(kinesisAppName, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, checkpointInterval, storageLevel, - Some(SerializableAWSCredentials(awsAccessKeyId, awsSecretKey)))) + ssc.withNamedScope("kinesis stream") { + new KinesisInputDStream(ssc, streamName, endpointUrl, validateRegion(regionName), + initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + Some(SerializableAWSCredentials(awsAccessKeyId, awsSecretKey))) + } } /** @@ -155,9 +155,10 @@ object KinesisUtils { initialPositionInStream: InitialPositionInStream, storageLevel: StorageLevel ): ReceiverInputDStream[Array[Byte]] = { - ssc.receiverStream( - new KinesisReceiver(ssc.sc.appName, streamName, endpointUrl, getRegionByEndpoint(endpointUrl), - initialPositionInStream, checkpointInterval, storageLevel, None)) + ssc.withNamedScope("kinesis stream") { + new KinesisInputDStream(ssc, streamName, endpointUrl, getRegionByEndpoint(endpointUrl), + initialPositionInStream, ssc.sc.appName, 
checkpointInterval, storageLevel, None) + } } /** diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index e81fb11e5959..a89e5627e014 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -24,8 +24,6 @@ import org.apache.spark.{SparkConf, SparkContext, SparkException} class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll { - private val regionId = "us-east-1" - private val endpointUrl = "https://kinesis.us-east-1.amazonaws.com" private val testData = 1 to 8 private var testUtils: KinesisTestUtils = null @@ -42,7 +40,7 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll override def beforeAll(): Unit = { runIfTestsEnabled("Prepare KinesisTestUtils") { - testUtils = new KinesisTestUtils(endpointUrl) + testUtils = new KinesisTestUtils() testUtils.createStream() shardIdToDataAndSeqNumbers = testUtils.pushData(testData) @@ -75,21 +73,21 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll testIfEnabled("Basic reading from Kinesis") { // Verify all data using multiple ranges in a single RDD partition - val receivedData1 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, + val receivedData1 = new KinesisBackedBlockRDD(sc, testUtils.regionName, testUtils.endpointUrl, fakeBlockIds(1), Array(SequenceNumberRanges(allRanges.toArray)) ).map { bytes => new String(bytes).toInt }.collect() assert(receivedData1.toSet === testData.toSet) // Verify all data using one range in each of the multiple RDD partitions - val receivedData2 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, + val receivedData2 = new KinesisBackedBlockRDD(sc, testUtils.regionName, testUtils.endpointUrl, fakeBlockIds(allRanges.size), allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray ).map { bytes => new String(bytes).toInt }.collect() assert(receivedData2.toSet === testData.toSet) // Verify ordering within each partition - val receivedData3 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, + val receivedData3 = new KinesisBackedBlockRDD(sc, testUtils.regionName, testUtils.endpointUrl, fakeBlockIds(allRanges.size), allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray ).map { bytes => new String(bytes).toInt }.collectPartitions() @@ -211,7 +209,8 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll }, "Incorrect configuration of RDD, unexpected ranges set" ) - val rdd = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, blockIds, ranges) + val rdd = new KinesisBackedBlockRDD( + sc, testUtils.regionName, testUtils.endpointUrl, blockIds, ranges) val collectedData = rdd.map { bytes => new String(bytes).toInt }.collect() @@ -224,8 +223,9 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll if (testIsBlockValid) { require(numPartitionsInBM === numPartitions, "All partitions must be in BlockManager") require(numPartitionsInKinesis === 0, "No partitions must be in Kinesis") - val rdd2 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, blockIds.toArray, - ranges, isBlockIdValid = Array.fill(blockIds.length)(false)) + val rdd2 = new KinesisBackedBlockRDD( + sc, testUtils.regionName, testUtils.endpointUrl, 
blockIds.toArray, ranges, + isBlockIdValid = Array.fill(blockIds.length)(false)) intercept[SparkException] { rdd2.collect() } diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala index 8373138785a8..ee428f31d6ce 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala @@ -31,7 +31,7 @@ trait KinesisFunSuite extends SparkFunSuite { if (shouldRunTests) { test(testName)(testBody) } else { - ignore(s"$testName [enable by setting env var $envVarName=1]")(testBody) + ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) } } @@ -40,7 +40,7 @@ trait KinesisFunSuite extends SparkFunSuite { if (shouldRunTests) { body } else { - ignore(s"$message [enable by setting env var $envVarName=1]")() + ignore(s"$message [enable by setting env var $envVarNameForEnablingTests=1]")() } } } diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index 98f2c7c4f1bf..ceb135e0651a 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -22,15 +22,14 @@ import scala.collection.JavaConversions.seqAsJavaList import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException} import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason import com.amazonaws.services.kinesis.model.Record +import org.mockito.Matchers._ import org.mockito.Mockito._ -import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.mock.MockitoSugar +import org.scalatest.{BeforeAndAfter, Matchers} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext, TestSuiteBase} +import org.apache.spark.streaming.{Milliseconds, TestSuiteBase} import org.apache.spark.util.{Clock, ManualClock, Utils} /** @@ -44,6 +43,8 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft val endpoint = "endpoint-url" val workerId = "dummyWorkerId" val shardId = "dummyShardId" + val seqNum = "dummySeqNum" + val someSeqNum = Some(seqNum) val record1 = new Record() record1.setData(ByteBuffer.wrap("Spark In Action".getBytes())) @@ -80,16 +81,18 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft test("process records including store and checkpoint") { when(receiverMock.isStopped()).thenReturn(false) + when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) when(checkpointStateMock.shouldCheckpoint()).thenReturn(true) val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.initialize(shardId) recordProcessor.processRecords(batch, checkpointerMock) verify(receiverMock, times(1)).isStopped() - verify(receiverMock, times(1)).store(record1.getData().array()) - 
verify(receiverMock, times(1)).store(record2.getData().array()) + verify(receiverMock, times(1)).addRecords(shardId, batch) + verify(receiverMock, times(1)).getLatestSeqNumToCheckpoint(shardId) verify(checkpointStateMock, times(1)).shouldCheckpoint() - verify(checkpointerMock, times(1)).checkpoint() + verify(checkpointerMock, times(1)).checkpoint(anyString) verify(checkpointStateMock, times(1)).advanceCheckpoint() } @@ -100,19 +103,25 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft recordProcessor.processRecords(batch, checkpointerMock) verify(receiverMock, times(1)).isStopped() + verify(receiverMock, never).addRecords(anyString, anyListOf(classOf[Record])) + verify(checkpointerMock, never).checkpoint(anyString) } test("shouldn't checkpoint when exception occurs during store") { when(receiverMock.isStopped()).thenReturn(false) - when(receiverMock.store(record1.getData().array())).thenThrow(new RuntimeException()) + when( + receiverMock.addRecords(anyString, anyListOf(classOf[Record])) + ).thenThrow(new RuntimeException()) intercept[RuntimeException] { val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.initialize(shardId) recordProcessor.processRecords(batch, checkpointerMock) } verify(receiverMock, times(1)).isStopped() - verify(receiverMock, times(1)).store(record1.getData().array()) + verify(receiverMock, times(1)).addRecords(shardId, batch) + verify(checkpointerMock, never).checkpoint(anyString) } test("should set checkpoint time to currentTime + checkpoint interval upon instantiation") { @@ -158,19 +167,25 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } test("shutdown should checkpoint if the reason is TERMINATE") { + when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) - val reason = ShutdownReason.TERMINATE - recordProcessor.shutdown(checkpointerMock, reason) + recordProcessor.initialize(shardId) + recordProcessor.shutdown(checkpointerMock, ShutdownReason.TERMINATE) - verify(checkpointerMock, times(1)).checkpoint() + verify(receiverMock, times(1)).getLatestSeqNumToCheckpoint(shardId) + verify(checkpointerMock, times(1)).checkpoint(anyString) } test("shutdown should not checkpoint if the reason is something other than TERMINATE") { + when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.initialize(shardId) recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) recordProcessor.shutdown(checkpointerMock, null) - verify(checkpointerMock, never()).checkpoint() + verify(checkpointerMock, never).checkpoint(anyString) } test("retry success on first attempt") { diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala index b88c9c6478d5..1177dc758100 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala @@ -22,34 +22,67 @@ import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random +import com.amazonaws.regions.RegionUtils import 
com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import org.scalatest.Matchers._ import org.scalatest.concurrent.Eventually import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} -import org.apache.spark.storage.StorageLevel +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming._ -import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.streaming.kinesis.KinesisTestUtils._ +import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult +import org.apache.spark.streaming.scheduler.ReceivedBlockInfo +import org.apache.spark.util.Utils +import org.apache.spark.{SparkConf, SparkContext} class KinesisStreamSuite extends KinesisFunSuite with Eventually with BeforeAndAfter with BeforeAndAfterAll { - // This is the name that KCL uses to save metadata to DynamoDB - private val kinesisAppName = s"KinesisStreamSuite-${math.abs(Random.nextLong())}" + // This is the name that KCL will use to save metadata to DynamoDB + private val appName = s"KinesisStreamSuite-${math.abs(Random.nextLong())}" + private val batchDuration = Seconds(1) - private var ssc: StreamingContext = _ - private var sc: SparkContext = _ + // Dummy parameters for API testing + private val dummyEndpointUrl = defaultEndpointUrl + private val dummyRegionName = RegionUtils.getRegionByEndpoint(dummyEndpointUrl).getName() + private val dummyAWSAccessKey = "dummyAccessKey" + private val dummyAWSSecretKey = "dummySecretKey" + + private var testUtils: KinesisTestUtils = null + private var ssc: StreamingContext = null + private var sc: SparkContext = null override def beforeAll(): Unit = { val conf = new SparkConf() .setMaster("local[4]") .setAppName("KinesisStreamSuite") // Setting Spark app name to Kinesis app name sc = new SparkContext(conf) + + runIfTestsEnabled("Prepare KinesisTestUtils") { + testUtils = new KinesisTestUtils() + testUtils.createStream() + } } override def afterAll(): Unit = { - sc.stop() - // Delete the Kinesis stream as well as the DynamoDB table generated by - // Kinesis Client Library when consuming the stream + if (ssc != null) { + ssc.stop() + } + if (sc != null) { + sc.stop() + } + if (testUtils != null) { + // Delete the Kinesis stream as well as the DynamoDB table generated by + // Kinesis Client Library when consuming the stream + testUtils.deleteStream() + testUtils.deleteDynamoDBTable(appName) + } + } + + before { + ssc = new StreamingContext(sc, batchDuration) } after { @@ -57,21 +90,75 @@ class KinesisStreamSuite extends KinesisFunSuite ssc.stop(stopSparkContext = false) ssc = null } + if (testUtils != null) { + testUtils.deleteDynamoDBTable(appName) + } } test("KinesisUtils API") { - ssc = new StreamingContext(sc, Seconds(1)) // Tests the API, does not actually test data receiving val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", Seconds(2), + dummyEndpointUrl, Seconds(2), InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) val kinesisStream2 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", "us-west-2", + dummyEndpointUrl, dummyRegionName, InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2) val kinesisStream3 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", "us-west-2", + dummyEndpointUrl, dummyRegionName, InitialPositionInStream.LATEST, 
Seconds(2), StorageLevel.MEMORY_AND_DISK_2, - "awsAccessKey", "awsSecretKey") + dummyAWSAccessKey, dummyAWSSecretKey) + } + + test("RDD generation") { + val inputStream = KinesisUtils.createStream(ssc, appName, "dummyStream", + dummyEndpointUrl, dummyRegionName, InitialPositionInStream.LATEST, Seconds(2), + StorageLevel.MEMORY_AND_DISK_2, dummyAWSAccessKey, dummyAWSSecretKey) + assert(inputStream.isInstanceOf[KinesisInputDStream]) + + val kinesisStream = inputStream.asInstanceOf[KinesisInputDStream] + val time = Time(1000) + + // Generate block info data for testing + val seqNumRanges1 = SequenceNumberRanges( + SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy")) + val blockId1 = StreamBlockId(kinesisStream.id, 123) + val blockInfo1 = ReceivedBlockInfo( + 0, None, Some(seqNumRanges1), new BlockManagerBasedStoreResult(blockId1, None)) + + val seqNumRanges2 = SequenceNumberRanges( + SequenceNumberRange("fakeStream", "fakeShardId", "aaa", "bbb")) + val blockId2 = StreamBlockId(kinesisStream.id, 345) + val blockInfo2 = ReceivedBlockInfo( + 0, None, Some(seqNumRanges2), new BlockManagerBasedStoreResult(blockId2, None)) + + // Verify that the generated KinesisBackedBlockRDD has the all the right information + val blockInfos = Seq(blockInfo1, blockInfo2) + val nonEmptyRDD = kinesisStream.createBlockRDD(time, blockInfos) + nonEmptyRDD shouldBe a [KinesisBackedBlockRDD] + val kinesisRDD = nonEmptyRDD.asInstanceOf[KinesisBackedBlockRDD] + assert(kinesisRDD.regionName === dummyRegionName) + assert(kinesisRDD.endpointUrl === dummyEndpointUrl) + assert(kinesisRDD.retryTimeoutMs === batchDuration.milliseconds) + assert(kinesisRDD.awsCredentialsOption === + Some(SerializableAWSCredentials(dummyAWSAccessKey, dummyAWSSecretKey))) + assert(nonEmptyRDD.partitions.size === blockInfos.size) + nonEmptyRDD.partitions.foreach { _ shouldBe a [KinesisBackedBlockRDDPartition] } + val partitions = nonEmptyRDD.partitions.map { + _.asInstanceOf[KinesisBackedBlockRDDPartition] }.toSeq + assert(partitions.map { _.seqNumberRanges } === Seq(seqNumRanges1, seqNumRanges2)) + assert(partitions.map { _.blockId } === Seq(blockId1, blockId2)) + assert(partitions.forall { _.isBlockIdValid === true }) + + // Verify that KinesisBackedBlockRDD is generated even when there are no blocks + val emptyRDD = kinesisStream.createBlockRDD(time, Seq.empty) + emptyRDD shouldBe a [KinesisBackedBlockRDD] + emptyRDD.partitions shouldBe empty + + // Verify that the KinesisBackedBlockRDD has isBlockValid = false when blocks are invalid + blockInfos.foreach { _.setBlockIdInvalid() } + kinesisStream.createBlockRDD(time, blockInfos).partitions.foreach { partition => + assert(partition.asInstanceOf[KinesisBackedBlockRDDPartition].isBlockIdValid === false) + } } @@ -84,32 +171,91 @@ class KinesisStreamSuite extends KinesisFunSuite * and you have to set the system environment variable RUN_KINESIS_TESTS=1 . 
*/ testIfEnabled("basic operation") { - val kinesisTestUtils = new KinesisTestUtils() - try { - kinesisTestUtils.createStream() - ssc = new StreamingContext(sc, Seconds(1)) - val awsCredentials = KinesisTestUtils.getAWSCredentials() - val stream = KinesisUtils.createStream(ssc, kinesisAppName, kinesisTestUtils.streamName, - kinesisTestUtils.endpointUrl, kinesisTestUtils.regionName, InitialPositionInStream.LATEST, - Seconds(10), StorageLevel.MEMORY_ONLY, - awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) - - val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int] - stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd => - collected ++= rdd.collect() - logInfo("Collected = " + rdd.collect().toSeq.mkString(", ")) - } - ssc.start() + val awsCredentials = KinesisTestUtils.getAWSCredentials() + val stream = KinesisUtils.createStream(ssc, appName, testUtils.streamName, + testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST, + Seconds(10), StorageLevel.MEMORY_ONLY, + awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) - val testData = 1 to 10 - eventually(timeout(120 seconds), interval(10 second)) { - kinesisTestUtils.pushData(testData) - assert(collected === testData.toSet, "\nData received does not match data sent") + val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int] + stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd => + collected ++= rdd.collect() + logInfo("Collected = " + rdd.collect().toSeq.mkString(", ")) + } + ssc.start() + + val testData = 1 to 10 + eventually(timeout(120 seconds), interval(10 second)) { + testUtils.pushData(testData) + assert(collected === testData.toSet, "\nData received does not match data sent") + } + ssc.stop(stopSparkContext = false) + } + + testIfEnabled("failure recovery") { + val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) + val checkpointDir = Utils.createTempDir().getAbsolutePath + + ssc = new StreamingContext(sc, Milliseconds(1000)) + ssc.checkpoint(checkpointDir) + + val awsCredentials = KinesisTestUtils.getAWSCredentials() + val collectedData = new mutable.HashMap[Time, (Array[SequenceNumberRanges], Seq[Int])] + with mutable.SynchronizedMap[Time, (Array[SequenceNumberRanges], Seq[Int])] + + val kinesisStream = KinesisUtils.createStream(ssc, appName, testUtils.streamName, + testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST, + Seconds(10), StorageLevel.MEMORY_ONLY, + awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) + + // Verify that the generated RDDs are KinesisBackedBlockRDDs, and collect the data in each batch + kinesisStream.foreachRDD((rdd: RDD[Array[Byte]], time: Time) => { + val kRdd = rdd.asInstanceOf[KinesisBackedBlockRDD] + val data = rdd.map { bytes => new String(bytes).toInt }.collect().toSeq + collectedData(time) = (kRdd.arrayOfseqNumberRanges, data) + }) + + ssc.remember(Minutes(60)) // remember all the batches so that they are all saved in checkpoint + ssc.start() + + def numBatchesWithData: Int = collectedData.count(_._2._2.nonEmpty) + + def isCheckpointPresent: Boolean = Checkpoint.getCheckpointFiles(checkpointDir).nonEmpty + + // Run until there are at least 10 batches with some data in them + // If this times out because numBatchesWithData is empty, then its likely that foreachRDD + // function failed with exceptions, and nothing got added to `collectedData` + eventually(timeout(2 minutes), interval(1 seconds)) { + 
testUtils.pushData(1 to 5) + assert(isCheckpointPresent && numBatchesWithData > 10) + } + ssc.stop(stopSparkContext = true) // stop the SparkContext so that the blocks are not reused + + // Restart the context from checkpoint and verify whether the recovered stream recomputes + // the same data + logInfo("Restarting from checkpoint") + ssc = new StreamingContext(checkpointDir) + ssc.start() + val recoveredKinesisStream = ssc.graph.getInputStreams().head + + // Verify that the recomputed RDDs are KinesisBackedBlockRDDs with the same sequence ranges + // and return the same data + val times = collectedData.keySet + times.foreach { time => + val (arrayOfSeqNumRanges, data) = collectedData(time) + val rdd = recoveredKinesisStream.getOrCompute(time).get.asInstanceOf[RDD[Array[Byte]]] + rdd shouldBe a [KinesisBackedBlockRDD] + + // Verify the recovered sequence ranges + val kRdd = rdd.asInstanceOf[KinesisBackedBlockRDD] + assert(kRdd.arrayOfseqNumberRanges.size === arrayOfSeqNumRanges.size) + arrayOfSeqNumRanges.zip(kRdd.arrayOfseqNumberRanges).foreach { case (expected, found) => + assert(expected.ranges.toSeq === found.ranges.toSeq) } - ssc.stop() - } finally { - kinesisTestUtils.deleteStream() - kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) + + // Verify the recovered data + assert(rdd.map { bytes => new String(bytes).toInt }.collect().toSeq === data) } + ssc.stop() } + } diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index 478d0019a25f..a44cd8fdc7b9 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 853dea9a7795..4c1c5604c195 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala index 33ac7b0ed609..7f4e7e9d79d6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala @@ -87,7 +87,7 @@ class VertexRDDImpl[VD] private[graphx] ( /** The number of vertices in the RDD.
*/ override def count(): Long = { - partitionsRDD.map(_.size).reduce(_ + _) + partitionsRDD.map(_.size.toLong).reduce(_ + _) } override private[graphx] def mapVertexPartitions[VD2: ClassTag]( diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala index 2bcf8684b8b8..a3ad6bed1c99 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala @@ -43,7 +43,7 @@ object LabelPropagation { */ def run[VD, ED: ClassTag](graph: Graph[VD, ED], maxSteps: Int): Graph[VertexId, ED] = { val lpaGraph = graph.mapVertices { case (vid, _) => vid } - def sendMessage(e: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, Map[VertexId, VertexId])] = { + def sendMessage(e: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, Map[VertexId, Long])] = { Iterator((e.srcId, Map(e.dstAttr -> 1L)), (e.dstId, Map(e.srcAttr -> 1L))) } def mergeMessage(count1: Map[VertexId, Long], count2: Map[VertexId, Long]) diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fd768d8119c..6a9056893be7 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/launcher/src/main/java/org/apache/spark/launcher/Main.java b/launcher/src/main/java/org/apache/spark/launcher/Main.java index 62492f9baf3b..a4e3acc674f3 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/Main.java +++ b/launcher/src/main/java/org/apache/spark/launcher/Main.java @@ -32,7 +32,7 @@ class Main { /** * Usage: Main [class] [class args] - *

+ *

* This CLI works in two different modes: *

    *
  • "spark-submit": if class is "org.apache.spark.deploy.SparkSubmit", the @@ -42,7 +42,7 @@ class Main { * * This class works in tandem with the "bin/spark-class" script on Unix-like systems, and * "bin/spark-class2.cmd" batch script on Windows to execute the final command. - *

    + *

    * On Unix-like systems, the output is a list of command arguments, separated by the NULL * character. On Windows, the output is a command line suitable for direct execution from the * script. diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java index 5f95e2c74f90..931a24cfd4b1 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkClassCommandBuilder.java @@ -28,7 +28,7 @@ /** * Command builder for internal Spark classes. - *

    + *

    * This class handles building the command to launch all internal Spark classes except for * SparkSubmit (which is handled by {@link SparkSubmitCommandBuilder} class. */ diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index c0f89c923069..57993405e47b 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -20,12 +20,13 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import static org.apache.spark.launcher.CommandBuilderUtils.*; -/** +/** * Launcher for Spark applications. *

    * Use this class to start Spark applications programmatically. The class uses a builder pattern @@ -57,7 +58,8 @@ public class SparkLauncher { /** Configuration key for the number of executor CPU cores. */ public static final String EXECUTOR_CORES = "spark.executor.cores"; - private final SparkSubmitCommandBuilder builder; + // Visible for testing. + final SparkSubmitCommandBuilder builder; public SparkLauncher() { this(null); @@ -187,6 +189,73 @@ public SparkLauncher setMainClass(String mainClass) { return this; } + /** + * Adds a no-value argument to the Spark invocation. If the argument is known, this method + * validates whether the argument is indeed a no-value argument, and throws an exception + * otherwise. + *

    + * Use this method with caution. It is possible to create an invalid Spark command by passing + * unknown arguments to this method, since those are allowed for forward compatibility. + * + * @param arg Argument to add. + * @return This launcher. + */ + public SparkLauncher addSparkArg(String arg) { + SparkSubmitOptionParser validator = new ArgumentValidator(false); + validator.parse(Arrays.asList(arg)); + builder.sparkArgs.add(arg); + return this; + } + + /** + * Adds an argument with a value to the Spark invocation. If the argument name corresponds to + * a known argument, the code validates that the argument actually expects a value, and throws + * an exception otherwise. + *

    + * It is safe to add arguments modified by other methods in this class (such as + * {@link #setMaster(String)} - the last invocation will be the one to take effect. + *

    + * Use this method with caution. It is possible to create an invalid Spark command by passing + * unknown arguments to this method, since those are allowed for forward compatibility. + * + * @param name Name of argument to add. + * @param value Value of the argument. + * @return This launcher. + */ + public SparkLauncher addSparkArg(String name, String value) { + SparkSubmitOptionParser validator = new ArgumentValidator(true); + if (validator.MASTER.equals(name)) { + setMaster(value); + } else if (validator.PROPERTIES_FILE.equals(name)) { + setPropertiesFile(value); + } else if (validator.CONF.equals(name)) { + String[] vals = value.split("=", 2); + setConf(vals[0], vals[1]); + } else if (validator.CLASS.equals(name)) { + setMainClass(value); + } else if (validator.JARS.equals(name)) { + builder.jars.clear(); + for (String jar : value.split(",")) { + addJar(jar); + } + } else if (validator.FILES.equals(name)) { + builder.files.clear(); + for (String file : value.split(",")) { + addFile(file); + } + } else if (validator.PY_FILES.equals(name)) { + builder.pyFiles.clear(); + for (String file : value.split(",")) { + addPyFile(file); + } + } else { + validator.parse(Arrays.asList(name, value)); + builder.sparkArgs.add(name); + builder.sparkArgs.add(value); + } + return this; + } + /** * Adds command line arguments for the application. * @@ -277,4 +346,32 @@ public Process launch() throws IOException { return pb.start(); } + private static class ArgumentValidator extends SparkSubmitOptionParser { + + private final boolean hasValue; + + ArgumentValidator(boolean hasValue) { + this.hasValue = hasValue; + } + + @Override + protected boolean handle(String opt, String value) { + if (value == null && hasValue) { + throw new IllegalArgumentException(String.format("'%s' does not expect a value.", opt)); + } + return true; + } + + @Override + protected boolean handleUnknown(String opt) { + // Do not fail on unknown arguments, to support future arguments added to SparkSubmit. + return true; + } + + protected void handleExtraArgs(List extra) { + // No op. + } + + }; + } diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java index 87c43aa9980e..fc87814a59ed 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java @@ -25,11 +25,11 @@ /** * Special command builder for handling a CLI invocation of SparkSubmit. - *

    + *

    * This builder adds command line parsing compatible with SparkSubmit. It handles setting * driver-side options and special parsing behavior needed for the special-casing certain internal * Spark applications. - *

    + *

    * This class has also some special features to aid launching pyspark. */ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { @@ -76,7 +76,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { "spark-internal"); } - private final List sparkArgs; + final List sparkArgs; private final boolean printHelp; /** diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java index b88bba883ac6..6767cc507964 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java @@ -23,7 +23,7 @@ /** * Parser for spark-submit command line options. - *

    + *

    * This class encapsulates the parsing code for spark-submit command line options, so that there * is a single list of options that needs to be maintained (well, sort of, but it makes it harder * to break things). @@ -51,6 +51,7 @@ class SparkSubmitOptionParser { protected final String MASTER = "--master"; protected final String NAME = "--name"; protected final String PACKAGES = "--packages"; + protected final String PACKAGES_EXCLUDE = "--exclude-packages"; protected final String PROPERTIES_FILE = "--properties-file"; protected final String PROXY_USER = "--proxy-user"; protected final String PY_FILES = "--py-files"; @@ -79,10 +80,10 @@ class SparkSubmitOptionParser { * This is the canonical list of spark-submit options. Each entry in the array contains the * different aliases for the same option; the first element of each entry is the "official" * name of the option, passed to {@link #handle(String, String)}. - *

    + *

    * Options not listed here nor in the "switch" list below will result in a call to * {@link $#handleUnknown(String)}. - *

    + *

    * These two arrays are visible for tests. */ final String[][] opts = { @@ -105,6 +106,7 @@ class SparkSubmitOptionParser { { NAME }, { NUM_EXECUTORS }, { PACKAGES }, + { PACKAGES_EXCLUDE }, { PRINCIPAL }, { PROPERTIES_FILE }, { PROXY_USER }, @@ -128,7 +130,7 @@ class SparkSubmitOptionParser { /** * Parse a list of spark-submit command line options. - *

    + *

    * See SparkSubmitArguments.scala for a more formal description of available options. * * @throws IllegalArgumentException If an error is found during parsing. diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 252d5abae1ca..d0c26dd05679 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -20,6 +20,7 @@ import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -35,8 +36,54 @@ public class SparkLauncherSuite { private static final Logger LOG = LoggerFactory.getLogger(SparkLauncherSuite.class); + @Test + public void testSparkArgumentHandling() throws Exception { + SparkLauncher launcher = new SparkLauncher() + .setSparkHome(System.getProperty("spark.test.home")); + SparkSubmitOptionParser opts = new SparkSubmitOptionParser(); + + launcher.addSparkArg(opts.HELP); + try { + launcher.addSparkArg(opts.PROXY_USER); + fail("Expected IllegalArgumentException."); + } catch (IllegalArgumentException e) { + // Expected. + } + + launcher.addSparkArg(opts.PROXY_USER, "someUser"); + try { + launcher.addSparkArg(opts.HELP, "someValue"); + fail("Expected IllegalArgumentException."); + } catch (IllegalArgumentException e) { + // Expected. + } + + launcher.addSparkArg("--future-argument"); + launcher.addSparkArg("--future-argument", "someValue"); + + launcher.addSparkArg(opts.MASTER, "myMaster"); + assertEquals("myMaster", launcher.builder.master); + + launcher.addJar("foo"); + launcher.addSparkArg(opts.JARS, "bar"); + assertEquals(Arrays.asList("bar"), launcher.builder.jars); + + launcher.addFile("foo"); + launcher.addSparkArg(opts.FILES, "bar"); + assertEquals(Arrays.asList("bar"), launcher.builder.files); + + launcher.addPyFile("foo"); + launcher.addSparkArg(opts.PY_FILES, "bar"); + assertEquals(Arrays.asList("bar"), launcher.builder.pyFiles); + + launcher.setConf("spark.foo", "foo"); + launcher.addSparkArg(opts.CONF, "spark.foo=bar"); + assertEquals("bar", launcher.builder.conf.get("spark.foo")); + } + @Test public void testChildProcLauncher() throws Exception { + SparkSubmitOptionParser opts = new SparkSubmitOptionParser(); Map env = new HashMap(); env.put("SPARK_PRINT_LAUNCH_COMMAND", "1"); @@ -44,9 +91,12 @@ public void testChildProcLauncher() throws Exception { .setSparkHome(System.getProperty("spark.test.home")) .setMaster("local") .setAppResource("spark-internal") + .addSparkArg(opts.CONF, + String.format("%s=-Dfoo=ShouldBeOverriddenBelow", SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS)) .setConf(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "-Dfoo=bar -Dtest.name=-testChildProcLauncher") .setConf(SparkLauncher.DRIVER_EXTRA_CLASSPATH, System.getProperty("java.class.path")) + .addSparkArg(opts.CLASS, "ShouldBeOverriddenBelow") .setMainClass(SparkLauncherTestApp.class.getName()) .addAppArgs("proc"); final Process app = launcher.launch(); diff --git a/make-distribution.sh b/make-distribution.sh index 4789b0e09cc8..04ad0052eb24 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -33,7 +33,7 @@ SPARK_HOME="$(cd "`dirname "$0"`"; pwd)" DISTDIR="$SPARK_HOME/dist" SPARK_TACHYON=false -TACHYON_VERSION="0.7.0" +TACHYON_VERSION="0.7.1" TACHYON_TGZ="tachyon-${TACHYON_VERSION}-bin.tar.gz" 
TACHYON_URL="https://github.com/amplab/tachyon/releases/download/v${TACHYON_VERSION}/${TACHYON_TGZ}" @@ -219,7 +219,6 @@ cp -r "$SPARK_HOME/ec2" "$DISTDIR" if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then mkdir -p "$DISTDIR"/R/lib cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib - cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR"/R/lib fi # Download and copy in tachyon, if requested diff --git a/mllib/pom.xml b/mllib/pom.xml index a5db14407b4f..d1c22f6dba2a 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index aef2c019d287..a3e59401c5cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -198,6 +198,6 @@ class PipelineModel private[ml] ( } override def copy(extra: ParamMap): PipelineModel = { - new PipelineModel(uid, stages.map(_.copy(extra))) + new PipelineModel(uid, stages.map(_.copy(extra))).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index 581d8fa7749b..45df557a8990 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -18,14 +18,13 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{PredictionModel, PredictorParams, Predictor} import org.apache.spark.ml.param.shared.HasRawPredictionCol import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.types.{DataType, StructType} /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index f2b992f8ba24..6f70b96b17ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -117,7 +117,7 @@ final class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. 
*/ - def this(rootNode: Node, numClasses: Int) = + private[ml] def this(rootNode: Node, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numClasses) override protected def predict(features: Vector): Double = { @@ -141,6 +141,7 @@ final class DecisionTreeClassificationModel private[ml] ( override def copy(extra: ParamMap): DecisionTreeClassificationModel = { copyValues(new DecisionTreeClassificationModel(uid, rootNode, numClasses), extra) + .setParent(parent) } override def toString: String = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index c3891a959926..3073a2a61ce8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -196,7 +196,7 @@ final class GBTClassificationModel( } override def copy(extra: ParamMap): GBTClassificationModel = { - copyValues(new GBTClassificationModel(uid, _trees, _treeWeights), extra) + copyValues(new GBTClassificationModel(uid, _trees, _treeWeights), extra).setParent(parent) } override def toString: String = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 8fc9199fb460..21fbe38ca823 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -30,10 +30,11 @@ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel /** @@ -41,12 +42,115 @@ import org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasThreshold with HasStandardization + with HasStandardization with HasThreshold { + + /** + * Set threshold in binary classification, in range [0, 1]. + * + * If the estimated probability of class label 1 is > threshold, then predict 1, else 0. + * A high threshold encourages the model to predict 0 more often; + * a low threshold encourages the model to predict 1 more often. + * + * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`. + * When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared. + * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * equivalent. + * + * Default is 0.5. + * @group setParam + */ + def setThreshold(value: Double): this.type = { + if (isSet(thresholds)) clear(thresholds) + set(threshold, value) + } + + /** + * Get threshold for binary classification. + * + * If [[threshold]] is set, returns that value. + * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. 
+ * Otherwise, returns [[threshold]] default value. + * + * @group getParam + * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. + */ + override def getThreshold: Double = { + checkThresholdConsistency() + if (isSet(thresholds)) { + val ts = $(thresholds) + require(ts.length == 2, "Logistic Regression getThreshold only applies to" + + " binary classification, but thresholds has length != 2. thresholds: " + ts.mkString(",")) + 1.0 / (1.0 + ts(0) / ts(1)) + } else { + $(threshold) + } + } + + /** + * Set thresholds in multiclass (or binary) classification to adjust the probability of + * predicting each class. Array must have length equal to the number of classes, with values >= 0. + * The class with largest value p/t is predicted, where p is the original probability of that + * class and t is the class' threshold. + * + * Note: When [[setThresholds()]] is called, any user-set value for [[threshold]] will be cleared. + * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * equivalent. + * + * @group setParam + */ + def setThresholds(value: Array[Double]): this.type = { + if (isSet(threshold)) clear(threshold) + set(thresholds, value) + } + + /** + * Get thresholds for binary or multiclass classification. + * + * If [[thresholds]] is set, return its value. + * Otherwise, if [[threshold]] is set, return the equivalent thresholds for binary + * classification: (1-threshold, threshold). + * If neither are set, throw an exception. + * + * @group getParam + */ + override def getThresholds: Array[Double] = { + checkThresholdConsistency() + if (!isSet(thresholds) && isSet(threshold)) { + val t = $(threshold) + Array(1-t, t) + } else { + $(thresholds) + } + } + + /** + * If [[threshold]] and [[thresholds]] are both set, ensures they are consistent. + * @throws IllegalArgumentException if [[threshold]] and [[thresholds]] are not equivalent + */ + protected def checkThresholdConsistency(): Unit = { + if (isSet(threshold) && isSet(thresholds)) { + val ts = $(thresholds) + require(ts.length == 2, "Logistic Regression found inconsistent values for threshold and" + + s" thresholds. Param threshold is set (${$(threshold)}), indicating binary" + + s" classification, but Param thresholds is set with length ${ts.length}." + + " Clear one Param value to fix this problem.") + val t = 1.0 / (1.0 + ts(0) / ts(1)) + require(math.abs($(threshold) - t) < 1E-5, "Logistic Regression getThreshold found" + + s" inconsistent values for threshold (${$(threshold)}) and thresholds (equivalent to $t)") + } + } + + override def validateParams(): Unit = { + checkThresholdConsistency() + } +} /** * :: Experimental :: * Logistic regression. - * Currently, this class only supports binary classification. + * Currently, this class only supports binary classification. It will support multiclass + * in the future. */ @Experimental class LogisticRegression(override val uid: String) @@ -94,25 +198,29 @@ class LogisticRegression(override val uid: String) * Whether to fit an intercept term. * Default is true. * @group setParam - * */ + */ def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) setDefault(fitIntercept -> true) /** * Whether to standardize the training features before fitting the model. * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. 
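The threshold and thresholds params above are kept mutually consistent: setting one clears the other, and a length-2 thresholds array maps back to a single binary threshold. A plain-Scala illustration of the conversion the scaladoc describes (standalone, not code from this patch):

// threshold p is equivalent to thresholds = Array(1 - p, p)
def toThresholds(threshold: Double): Array[Double] = Array(1.0 - threshold, threshold)

// and a length-2 thresholds array maps back to 1 / (1 + t0 / t1)
def toThreshold(thresholds: Array[Double]): Double = {
  require(thresholds.length == 2, "binary classification expects exactly two thresholds")
  1.0 / (1.0 + thresholds(0) / thresholds(1))
}

// Round trip: 0.7 -> Array(0.3, 0.7) -> 1 / (1 + 0.3 / 0.7) = 0.7
assert(math.abs(toThreshold(toThresholds(0.7)) - 0.7) < 1e-9)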
Note that when no regularization, - * with or without standardization, the models should be always converged to - * the same solution. + * so it will be transparent for users. Note that with/without standardization, + * the models should be always converged to the same solution when no regularization + * is applied. In R's GLMNET package, the default behavior is true as well. * Default is true. * @group setParam - * */ + */ def setStandardization(value: Boolean): this.type = set(standardization, value) setDefault(standardization -> true) - /** @group setParam */ - def setThreshold(value: Double): this.type = set(threshold, value) - setDefault(threshold -> 0.5) + override def setThreshold(value: Double): this.type = super.setThreshold(value) + + override def getThreshold: Double = super.getThreshold + + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) + + override def getThresholds: Array[Double] = super.getThresholds override protected def train(dataset: DataFrame): LogisticRegressionModel = { // Extract columns from data. If dataset is persisted, do not persist oldDataset. @@ -252,7 +360,13 @@ class LogisticRegression(override val uid: String) if (handlePersistence) instances.unpersist() - copyValues(new LogisticRegressionModel(uid, weights, intercept)) + val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) + val logRegSummary = new BinaryLogisticRegressionTrainingSummary( + model.transform(dataset), + $(probabilityCol), + $(labelCol), + objectiveHistory) + model.setSummary(logRegSummary) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) @@ -270,8 +384,13 @@ class LogisticRegressionModel private[ml] ( extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams { - /** @group setParam */ - def setThreshold(value: Double): this.type = set(threshold, value) + override def setThreshold(value: Double): this.type = super.setThreshold(value) + + override def getThreshold: Double = super.getThreshold + + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) + + override def getThresholds: Array[Double] = super.getThresholds /** Margin (rawPrediction) for class label 1. For binary classification only. */ private val margin: Vector => Double = (features) => { @@ -286,11 +405,44 @@ class LogisticRegressionModel private[ml] ( override val numClasses: Int = 2 + private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None + + /** + * Gets summary of model on training set. An exception is + * thrown if `trainingSummary == None`. + */ + def summary: LogisticRegressionTrainingSummary = trainingSummary match { + case Some(summ) => summ + case None => + throw new SparkException( + "No training summary available for this LogisticRegressionModel", + new NullPointerException()) + } + + private[classification] def setSummary( + summary: LogisticRegressionTrainingSummary): this.type = { + this.trainingSummary = Some(summary) + this + } + + /** Indicates whether a training summary exists for this model instance. */ + def hasSummary: Boolean = trainingSummary.isDefined + + /** + * Evaluates the model on a testset. + * @param dataset Test dataset to evaluate model on. 
+ */ + // TODO: decide on a good name before exposing to public API + private[classification] def evaluate(dataset: DataFrame): LogisticRegressionSummary = { + new BinaryLogisticRegressionSummary(this.transform(dataset), $(probabilityCol), $(labelCol)) + } + /** * Predict label for the given feature vector. - * The behavior of this can be adjusted using [[threshold]]. + * The behavior of this can be adjusted using [[thresholds]]. */ override protected def predict(features: Vector): Double = { + // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. if (score(features) > getThreshold) 1 else 0 } @@ -316,10 +468,11 @@ class LogisticRegressionModel private[ml] ( } override def copy(extra: ParamMap): LogisticRegressionModel = { - copyValues(new LogisticRegressionModel(uid, weights, intercept), extra) + copyValues(new LogisticRegressionModel(uid, weights, intercept), extra).setParent(parent) } override protected def raw2prediction(rawPrediction: Vector): Double = { + // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. val t = getThreshold val rawThreshold = if (t == 0.0) { Double.NegativeInfinity @@ -332,6 +485,7 @@ class LogisticRegressionModel private[ml] ( } override protected def probability2prediction(probability: Vector): Double = { + // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. if (probability(1) > getThreshold) 1 else 0 } } @@ -407,6 +561,128 @@ private[classification] class MultiClassSummarizer extends Serializable { } } +/** + * Abstraction for multinomial Logistic Regression Training results. + */ +sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { + + /** objective function (scaled loss + regularization) at each iteration. */ + def objectiveHistory: Array[Double] + + /** Number of training iterations until termination */ + def totalIterations: Int = objectiveHistory.length + +} + +/** + * Abstraction for Logistic Regression Results for a given model. + */ +sealed trait LogisticRegressionSummary extends Serializable { + + /** Dataframe outputted by the model's `transform` method. */ + def predictions: DataFrame + + /** Field in "predictions" which gives the calibrated probability of each sample as a vector. */ + def probabilityCol: String + + /** Field in "predictions" which gives the the true label of each sample. */ + def labelCol: String + +} + +/** + * :: Experimental :: + * Logistic regression training results. + * @param predictions dataframe outputted by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the calibrated probability of + * each sample as a vector. + * @param labelCol field in "predictions" which gives the true label of each sample. + * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. + */ +@Experimental +class BinaryLogisticRegressionTrainingSummary private[classification] ( + predictions: DataFrame, + probabilityCol: String, + labelCol: String, + val objectiveHistory: Array[Double]) + extends BinaryLogisticRegressionSummary(predictions, probabilityCol, labelCol) + with LogisticRegressionTrainingSummary { + +} + +/** + * :: Experimental :: + * Binary Logistic regression results for a given model. + * @param predictions dataframe outputted by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the calibrated probability of + * each sample. 
+ * @param labelCol field in "predictions" which gives the true label of each sample. + */ +@Experimental +class BinaryLogisticRegressionSummary private[classification] ( + @transient override val predictions: DataFrame, + override val probabilityCol: String, + override val labelCol: String) extends LogisticRegressionSummary { + + private val sqlContext = predictions.sqlContext + import sqlContext.implicits._ + + /** + * Returns a BinaryClassificationMetrics object. + */ + // TODO: Allow the user to vary the number of bins using a setBins method in + // BinaryClassificationMetrics. For now the default is set to 100. + @transient private val binaryMetrics = new BinaryClassificationMetrics( + predictions.select(probabilityCol, labelCol).map { + case Row(score: Vector, label: Double) => (score(1), label) + }, 100 + ) + + /** + * Returns the receiver operating characteristic (ROC) curve, + * which is an Dataframe having two fields (FPR, TPR) + * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. + * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic + */ + @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") + + /** + * Computes the area under the receiver operating characteristic (ROC) curve. + */ + lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() + + /** + * Returns the precision-recall curve, which is an Dataframe containing + * two fields recall, precision with (0.0, 1.0) prepended to it. + */ + @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") + + /** + * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. + */ + @transient lazy val fMeasureByThreshold: DataFrame = { + binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure") + } + + /** + * Returns a dataframe with two fields (threshold, precision) curve. + * Every possible probability obtained in transforming the dataset are used + * as thresholds used in calculating the precision. + */ + @transient lazy val precisionByThreshold: DataFrame = { + binaryMetrics.precisionByThreshold().toDF("threshold", "precision") + } + + /** + * Returns a dataframe with two fields (threshold, recall) curve. + * Every possible probability obtained in transforming the dataset are used + * as thresholds used in calculating the recall. + */ + @transient lazy val recallByThreshold: DataFrame = { + binaryMetrics.recallByThreshold().toDF("threshold", "recall") + } +} + /** * LogisticAggregator computes the gradient and loss for binary logistic loss function, as used * in binary classification for samples in sparse or dense vector in a online fashion. diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 8cd2103d7d5e..82fc80c58054 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -32,6 +32,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams with HasSeed with HasMaxIter with HasTol { /** * Layer sizes including input size and output size. 
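The summary classes above surface the training curve and binary metrics directly from a fitted model. A hedged usage sketch (assumes an existing DataFrame named training with "label" and "features" columns; not code from this patch):

import org.apache.spark.ml.classification.{BinaryLogisticRegressionTrainingSummary, LogisticRegression}

val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val model = lr.fit(training)

val summary = model.summary                        // throws if no summary was attached
println(s"iterations: ${summary.totalIterations}")
summary.objectiveHistory.foreach(println)

summary match {
  case b: BinaryLogisticRegressionTrainingSummary =>
    println(s"area under ROC: ${b.areaUnderROC}")
    b.roc.show()                                   // DataFrame with FPR / TPR columns
  case _ =>
}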
+ * Default: Array(1, 1) * @group param */ final val layers: IntArrayParam = new IntArrayParam(this, "layers", @@ -42,9 +43,6 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams ParamValidators.arrayLengthGt(1) ) - /** @group setParam */ - def setLayers(value: Array[Int]): this.type = set(layers, value) - /** @group getParam */ final def getLayers: Array[Int] = $(layers) @@ -53,6 +51,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams * Data is stacked within partitions. If block size is more than remaining data in * a partition then it is adjusted to the size of this data. * Recommended size is between 10 and 1000. + * Default: 128 * @group expertParam */ final val blockSize: IntParam = new IntParam(this, "blockSize", @@ -61,33 +60,9 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams "it is adjusted to the size of this data. Recommended size is between 10 and 1000", ParamValidators.gt(0)) - /** @group setParam */ - def setBlockSize(value: Int): this.type = set(blockSize, value) - /** @group getParam */ final def getBlockSize: Int = $(blockSize) - /** - * Set the maximum number of iterations. - * Default is 100. - * @group setParam - */ - def setMaxIter(value: Int): this.type = set(maxIter, value) - - /** - * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-4. - * @group setParam - */ - def setTol(value: Double): this.type = set(tol, value) - - /** - * Set the seed for weights initialization. - * @group setParam - */ - def setSeed(value: Long): this.type = set(seed, value) - setDefault(maxIter -> 100, tol -> 1e-4, layers -> Array(1, 1), blockSize -> 128) } @@ -131,11 +106,38 @@ private object LabelConverter { */ @Experimental class MultilayerPerceptronClassifier(override val uid: String) - extends Predictor[Vector, MultilayerPerceptronClassifier, MultilayerPerceptronClassifierModel] + extends Predictor[Vector, MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel] with MultilayerPerceptronParams { def this() = this(Identifiable.randomUID("mlpc")) + /** @group setParam */ + def setLayers(value: Array[Int]): this.type = set(layers, value) + + /** @group setParam */ + def setBlockSize(value: Int): this.type = set(blockSize, value) + + /** + * Set the maximum number of iterations. + * Default is 100. + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** + * Set the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-4. + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + + /** + * Set the seed for weights initialization. 
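The MultilayerPerceptronClassifier setters above now live on the estimator itself rather than on the shared params trait. A hedged usage sketch (assumes an existing DataFrame train with "label" and "features" columns; the layer sizes are placeholders):

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier

val mlp = new MultilayerPerceptronClassifier()
  .setLayers(Array(4, 5, 3))    // input size, one hidden layer, output size (number of classes)
  .setBlockSize(128)
  .setMaxIter(100)
  .setSeed(1234L)
val mlpModel = mlp.fit(train)   // returns the renamed MultilayerPerceptronClassificationModel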
+ * @group setParam + */ + def setSeed(value: Long): this.type = set(seed, value) + override def copy(extra: ParamMap): MultilayerPerceptronClassifier = defaultCopy(extra) /** @@ -146,7 +148,7 @@ class MultilayerPerceptronClassifier(override val uid: String) * @param dataset Training dataset * @return Fitted model */ - override protected def train(dataset: DataFrame): MultilayerPerceptronClassifierModel = { + override protected def train(dataset: DataFrame): MultilayerPerceptronClassificationModel = { val myLayers = $(layers) val labels = myLayers.last val lpData = extractLabeledPoints(dataset) @@ -156,13 +158,13 @@ class MultilayerPerceptronClassifier(override val uid: String) FeedForwardTrainer.LBFGSOptimizer.setConvergenceTol($(tol)).setNumIterations($(maxIter)) FeedForwardTrainer.setStackSize($(blockSize)) val mlpModel = FeedForwardTrainer.train(data) - new MultilayerPerceptronClassifierModel(uid, myLayers, mlpModel.weights()) + new MultilayerPerceptronClassificationModel(uid, myLayers, mlpModel.weights()) } } /** * :: Experimental :: - * Classifier model based on the Multilayer Perceptron. + * Classification model based on the Multilayer Perceptron. * Each layer has sigmoid activation function, output layer has softmax. * @param uid uid * @param layers array of layer sizes including input and output layers @@ -170,11 +172,11 @@ class MultilayerPerceptronClassifier(override val uid: String) * @return prediction model */ @Experimental -class MultilayerPerceptronClassifierModel private[ml] ( +class MultilayerPerceptronClassificationModel private[ml] ( override val uid: String, - layers: Array[Int], - weights: Vector) - extends PredictionModel[Vector, MultilayerPerceptronClassifierModel] + val layers: Array[Int], + val weights: Vector) + extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable { private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights) @@ -187,7 +189,7 @@ class MultilayerPerceptronClassifierModel private[ml] ( LabelConverter.decodeLabel(mlpModel.predict(features)) } - override def copy(extra: ParamMap): MultilayerPerceptronClassifierModel = { - copyValues(new MultilayerPerceptronClassifierModel(uid, layers, weights), extra) + override def copy(extra: ParamMap): MultilayerPerceptronClassificationModel = { + copyValues(new MultilayerPerceptronClassificationModel(uid, layers, weights), extra) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index b46b676204e0..69cb88a7e671 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -18,11 +18,11 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkException -import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor} -import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param, DoubleParam} +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.PredictorParams +import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes} -import org.apache.spark.mllib.classification.{NaiveBayesModel => OldNaiveBayesModel} +import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, NaiveBayesModel => OldNaiveBayesModel} import 
org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -59,6 +59,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { } /** + * :: Experimental :: * Naive Bayes Classifiers. * It supports both Multinomial NB * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]]) @@ -68,6 +69,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * ([[http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html]]). * The input feature values must be nonnegative. */ +@Experimental class NaiveBayes(override val uid: String) extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] with NaiveBayesParams { @@ -86,6 +88,7 @@ class NaiveBayes(override val uid: String) * Set the model type using a string (case-sensitive). * Supported options: "multinomial" and "bernoulli". * Default is "multinomial" + * @group setParam */ def setModelType(value: String): this.type = set(modelType, value) setDefault(modelType -> OldNaiveBayes.Multinomial) @@ -100,8 +103,13 @@ class NaiveBayes(override val uid: String) } /** + * :: Experimental :: * Model produced by [[NaiveBayes]] + * @param pi log of class priors, whose dimension is C (number of classes) + * @param theta log of class conditional probabilities, whose dimension is C (number of classes) + * by D (number of features) */ +@Experimental class NaiveBayesModel private[ml] ( override val uid: String, val pi: Vector, diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index 1741f19dc911..1132d8046df6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -138,7 +138,7 @@ final class OneVsRestModel private[ml] ( override def copy(extra: ParamMap): OneVsRestModel = { val copied = new OneVsRestModel( uid, labelMetadata, models.map(_.copy(extra).asInstanceOf[ClassificationModel[_, _]])) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index f9c9c2371f5c..fdd1851ae550 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -20,17 +20,16 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.SchemaUtils -import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{DenseVector, Vector, VectorUDT, Vectors} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DoubleType, DataType, StructType} +import org.apache.spark.sql.types.{DataType, StructType} /** * (private[classification]) Params for probabilistic classification. 
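With NaiveBayes now public and annotated @Experimental, the pi and theta members documented above can be inspected on the fitted model. A hedged usage sketch (assumes an existing DataFrame train with nonnegative "features" and a "label" column):

import org.apache.spark.ml.classification.NaiveBayes

val nb = new NaiveBayes().setModelType("multinomial")   // or "bernoulli"
val nbModel = nb.fit(train)
println(nbModel.pi)      // log class priors, length C
println(nbModel.theta)   // log class-conditional probabilities, C x D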
*/ private[classification] trait ProbabilisticClassifierParams - extends ClassifierParams with HasProbabilityCol { - + extends ClassifierParams with HasProbabilityCol with HasThresholds { override protected def validateAndTransformSchema( schema: StructType, fitting: Boolean, @@ -51,7 +50,7 @@ private[classification] trait ProbabilisticClassifierParams * @tparam M Concrete Model type */ @DeveloperApi -private[spark] abstract class ProbabilisticClassifier[ +abstract class ProbabilisticClassifier[ FeaturesType, E <: ProbabilisticClassifier[FeaturesType, E, M], M <: ProbabilisticClassificationModel[FeaturesType, M]] @@ -59,6 +58,9 @@ private[spark] abstract class ProbabilisticClassifier[ /** @group setParam */ def setProbabilityCol(value: String): E = set(probabilityCol, value).asInstanceOf[E] + + /** @group setParam */ + def setThresholds(value: Array[Double]): E = set(thresholds, value).asInstanceOf[E] } @@ -72,7 +74,7 @@ private[spark] abstract class ProbabilisticClassifier[ * @tparam M Concrete Model type */ @DeveloperApi -private[spark] abstract class ProbabilisticClassificationModel[ +abstract class ProbabilisticClassificationModel[ FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]] extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams { @@ -80,6 +82,9 @@ private[spark] abstract class ProbabilisticClassificationModel[ /** @group setParam */ def setProbabilityCol(value: String): M = set(probabilityCol, value).asInstanceOf[M] + /** @group setParam */ + def setThresholds(value: Array[Double]): M = set(thresholds, value).asInstanceOf[M] + /** * Transforms dataset by reading from [[featuresCol]], and appending new columns as specified by * parameters: @@ -92,6 +97,11 @@ private[spark] abstract class ProbabilisticClassificationModel[ */ override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) + if (isDefined(thresholds)) { + require($(thresholds).length == numClasses, this.getClass.getSimpleName + + ".transform() called with non-matching numClasses and thresholds.length." + + s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}") + } // Output selected columns only. // This is a bit complicated since it tries to avoid repeated computation. @@ -155,6 +165,14 @@ private[spark] abstract class ProbabilisticClassificationModel[ raw2probabilityInPlace(probs) } + override protected def raw2prediction(rawPrediction: Vector): Double = { + if (!isDefined(thresholds)) { + rawPrediction.argmax + } else { + probability2prediction(raw2probability(rawPrediction)) + } + } + /** * Predict the probability of each class given the features. * These predictions are also called class conditional probabilities. @@ -170,10 +188,21 @@ private[spark] abstract class ProbabilisticClassificationModel[ /** * Given a vector of class conditional probabilities, select the predicted label. - * This may be overridden to support thresholds which favor particular labels. + * This supports thresholds which favor particular labels. 
* @return predicted label */ - protected def probability2prediction(probability: Vector): Double = probability.argmax + protected def probability2prediction(probability: Vector): Double = { + if (!isDefined(thresholds)) { + probability.argmax + } else { + val thresholds: Array[Double] = getThresholds + val scaledProbability: Array[Double] = + probability.toArray.zip(thresholds).map { case (p, t) => + if (t == 0.0) Double.PositiveInfinity else p / t + } + Vectors.dense(scaledProbability).argmax + } + } } private[ml] object ProbabilisticClassificationModel { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 56e80cc8fe6e..11a6d7246833 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -95,7 +95,8 @@ final class RandomForestClassifier(override val uid: String) val trees = RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed) .map(_.asInstanceOf[DecisionTreeClassificationModel]) - new RandomForestClassificationModel(trees, numClasses) + val numFeatures = oldDataset.first().features.size + new RandomForestClassificationModel(trees, numFeatures, numClasses) } override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra) @@ -118,11 +119,13 @@ object RandomForestClassifier { * features. * @param _trees Decision trees in the ensemble. * Warning: These have null parents. + * @param numFeatures Number of features used by this model */ @Experimental final class RandomForestClassificationModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeClassificationModel], + val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with TreeEnsembleModel with Serializable { @@ -133,8 +136,11 @@ final class RandomForestClassificationModel private[ml] ( * Construct a random forest classification model, with all trees weighted equally. * @param trees Component trees */ - def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) = - this(Identifiable.randomUID("rfc"), trees, numClasses) + private[ml] def this( + trees: Array[DecisionTreeClassificationModel], + numFeatures: Int, + numClasses: Int) = + this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses) override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] @@ -182,13 +188,31 @@ final class RandomForestClassificationModel private[ml] ( } override def copy(extra: ParamMap): RandomForestClassificationModel = { - copyValues(new RandomForestClassificationModel(uid, _trees, numClasses), extra) + copyValues(new RandomForestClassificationModel(uid, _trees, numFeatures, numClasses), extra) + .setParent(parent) } override def toString: String = { s"RandomForestClassificationModel with $numTrees trees" } + /** + * Estimate of the importance of each feature. + * + * This generalizes the idea of "Gini" importance to other losses, + * following the explanation of Gini importance from "Random Forests" documentation + * by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. 
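The probability2prediction override above predicts the class that maximizes p / t, the class probability scaled by its threshold. A tiny standalone illustration (plain Scala, not code from this patch):

val probability = Array(0.20, 0.50, 0.30)
val thresholds  = Array(0.60, 0.60, 0.20)
val scaled = probability.zip(thresholds).map {
  case (_, 0.0) => Double.PositiveInfinity   // a zero threshold always wins
  case (p, t)   => p / t
}
val prediction = scaled.indexOf(scaled.max)  // 2: class 2 wins despite a lower raw probability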
+ * + * This feature importance is calculated as follows: + * - Average over trees: + * - importance(feature j) = sum (over nodes which split on feature j) of the gain, + * where gain is scaled by the number of instances passing through node + * - Normalize importances for tree based on total number of training instances used + * to build tree. + * - Normalize feature importance vector to sum to 1. + */ + lazy val featureImportances: Vector = RandomForest.featureImportances(trees, numFeatures) + /** (private[ml]) Convert to a model in the old API */ private[ml] def toOld: OldRandomForestModel = { new OldRandomForestModel(OldAlgo.Classification, _trees.map(_.toOld)) @@ -210,6 +234,6 @@ private[ml] object RandomForestClassificationModel { DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures) } val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc") - new RandomForestClassificationModel(uid, newTrees, numClasses) + new RandomForestClassificationModel(uid, newTrees, -1, numClasses) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index dc192add6ca1..47a18cdb31b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -18,8 +18,8 @@ package org.apache.spark.ml.clustering import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.param.{Param, Params, IntParam, DoubleParam, ParamMap} -import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasMaxIter, HasPredictionCol, HasSeed} +import org.apache.spark.ml.param.{Param, Params, IntParam, ParamMap} +import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel} @@ -27,14 +27,13 @@ import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.util.Utils /** * Common params for KMeans and KMeansModel */ -private[clustering] trait KMeansParams - extends Params with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol { +private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFeaturesCol + with HasSeed with HasPredictionCol with HasTol { /** * Set the number of clusters to create (k). Must be > 1. Default: 2. @@ -45,31 +44,6 @@ private[clustering] trait KMeansParams /** @group getParam */ def getK: Int = $(k) - /** - * Param the number of runs of the algorithm to execute in parallel. We initialize the algorithm - * this many times with random starting conditions (configured by the initialization mode), then - * return the best clustering found over any run. Must be >= 1. Default: 1. - * @group param - */ - final val runs = new IntParam(this, "runs", - "number of runs of the algorithm to execute in parallel", (value: Int) => value >= 1) - - /** @group getParam */ - def getRuns: Int = $(runs) - - /** - * Param the distance threshold within which we've consider centers to have converged. - * If all centers move less than this Euclidean distance, we stop iterating one run. - * Must be >= 0.0. 
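The featureImportances value described above is exposed as a lazy member of the fitted forest. A hedged usage sketch (assumes an existing DataFrame df with "label" and "features" columns):

import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier().setNumTrees(20)
val rfModel = rf.fit(df)
// A vector of length numFeatures, normalized to sum to 1 as described above.
println(rfModel.featureImportances)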
Default: 1e-4 - * @group param - */ - final val epsilon = new DoubleParam(this, "epsilon", - "distance threshold within which we've consider centers to have converge", - (value: Double) => value >= 0.0) - - /** @group getParam */ - def getEpsilon: Double = $(epsilon) - /** * Param for the initialization algorithm. This can be either "random" to choose random points as * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ @@ -136,9 +110,9 @@ class KMeansModel private[ml] ( /** * :: Experimental :: - * K-means clustering with support for multiple parallel runs and a k-means++ like initialization - * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, - * they are executed together with joint passes over the data for efficiency. + * K-means clustering with support for k-means|| initialization proposed by Bahmani et al. + * + * @see [[http://dx.doi.org/10.14778/2180912.2180915 Bahmani et al., Scalable k-means++.]] */ @Experimental class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMeansParams { @@ -146,10 +120,9 @@ class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMean setDefault( k -> 2, maxIter -> 20, - runs -> 1, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 5, - epsilon -> 1e-4) + tol -> 1e-4) override def copy(extra: ParamMap): KMeans = defaultCopy(extra) @@ -174,10 +147,7 @@ class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMean def setMaxIter(value: Int): this.type = set(maxIter, value) /** @group setParam */ - def setRuns(value: Int): this.type = set(runs, value) - - /** @group setParam */ - def setEpsilon(value: Double): this.type = set(epsilon, value) + def setTol(value: Double): this.type = set(tol, value) /** @group setParam */ def setSeed(value: Long): this.type = set(seed, value) @@ -191,8 +161,7 @@ class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMean .setInitializationSteps($(initSteps)) .setMaxIterations($(maxIter)) .setSeed($(seed)) - .setEpsilon($(epsilon)) - .setRuns($(runs)) + .setEpsilon($(tol)) val parentModel = algo.run(rdd) val model = new KMeansModel(uid, parentModel) copyValues(model) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 4a82b77f0edc..08df2919a8a8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.DoubleType /** * :: Experimental :: - * Evaluator for binary classification, which expects two input columns: score and label. + * Evaluator for binary classification, which expects two input columns: rawPrediction and label. 
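In the ml KMeans API above, the shared tol param replaces epsilon and the runs param is removed. A hedged usage sketch (assumes an existing DataFrame df with a "features" column):

import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans()
  .setK(3)
  .setMaxIter(20)
  .setTol(1e-4)     // convergence threshold, formerly setEpsilon
  .setSeed(1L)
val kmModel = kmeans.fit(df)
kmModel.clusterCenters.foreach(println)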
*/ @Experimental class BinaryClassificationEvaluator(override val uid: String) @@ -38,10 +38,14 @@ class BinaryClassificationEvaluator(override val uid: String) /** * param for metric name in evaluation + * Default: areaUnderROC * @group param */ - val metricName: Param[String] = new Param(this, "metricName", - "metric name in evaluation (areaUnderROC|areaUnderPR)") + val metricName: Param[String] = { + val allowedParams = ParamValidators.inArray(Array("areaUnderROC", "areaUnderPR")) + new Param( + this, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)", allowedParams) + } /** @group getParam */ def getMetricName: String = $(metricName) @@ -50,6 +54,13 @@ class BinaryClassificationEvaluator(override val uid: String) def setMetricName(value: String): this.type = set(metricName, value) /** @group setParam */ + def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value) + + /** + * @group setParam + * @deprecated use [[setRawPredictionCol()]] instead + */ + @deprecated("use setRawPredictionCol instead", "1.5.0") def setScoreCol(value: String): this.type = set(rawPredictionCol, value) /** @group setParam */ @@ -69,16 +80,17 @@ class BinaryClassificationEvaluator(override val uid: String) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { - case "areaUnderROC" => - metrics.areaUnderROC() - case "areaUnderPR" => - metrics.areaUnderPR() - case other => - throw new IllegalArgumentException(s"Does not support metric $other.") + case "areaUnderROC" => metrics.areaUnderROC() + case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } + override def isLargerBetter: Boolean = $(metricName) match { + case "areaUnderROC" => true + case "areaUnderPR" => true + } + override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala index e56c946a063e..13bd3307f8a2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala @@ -46,5 +46,12 @@ abstract class Evaluator extends Params { */ def evaluate(dataset: DataFrame): Double + /** + * Indicates whether the metric returned by [[evaluate()]] should be maximized (true, default) + * or minimized (false). + * A given evaluator may support multiple metrics which may be maximized or minimized. 
+ */ + def isLargerBetter: Boolean = true + override def copy(extra: ParamMap): Evaluator } diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala index 44f779c1908d..f73d2345078e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala @@ -81,5 +81,13 @@ class MulticlassClassificationEvaluator (override val uid: String) metric } + override def isLargerBetter: Boolean = $(metricName) match { + case "f1" => true + case "precision" => true + case "recall" => true + case "weightedPrecision" => true + case "weightedRecall" => true + } + override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index 01c000b47514..d21c88ab9b10 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -73,17 +73,20 @@ final class RegressionEvaluator(override val uid: String) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { - case "rmse" => - -metrics.rootMeanSquaredError - case "mse" => - -metrics.meanSquaredError - case "r2" => - metrics.r2 - case "mae" => - -metrics.meanAbsoluteError + case "rmse" => metrics.rootMeanSquaredError + case "mse" => metrics.meanSquaredError + case "r2" => metrics.r2 + case "mae" => metrics.meanAbsoluteError } metric } + override def isLargerBetter: Boolean = $(metricName) match { + case "rmse" => false + case "mse" => false + case "r2" => true + case "mae" => false + } + override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 46314854d5e3..edad75443645 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -41,6 +41,7 @@ final class Binarizer(override val uid: String) * Param for threshold used to binarize continuous features. * The features greater than the threshold, will be binarized to 1.0. * The features equal to or less than the threshold, will be binarized to 0.0. 
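The new isLargerBetter flag above lets model-selection code compare evaluator outputs without hard-coding whether a metric should be maximized or minimized. A small illustrative sketch (not Spark code):

import org.apache.spark.ml.evaluation.Evaluator

// Pick the index of the best metric value according to the evaluator's direction.
def pickBest(metrics: Seq[Double], evaluator: Evaluator): Int =
  if (evaluator.isLargerBetter) metrics.indexOf(metrics.max)
  else metrics.indexOf(metrics.min)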
+ * Default: 0.0 * @group param */ val threshold: DoubleParam = diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 67e4785bc355..cfca494dcf46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -90,7 +90,9 @@ final class Bucketizer(override val uid: String) SchemaUtils.appendColumn(schema, prepOutputField(schema)) } - override def copy(extra: ParamMap): Bucketizer = defaultCopy(extra) + override def copy(extra: ParamMap): Bucketizer = { + defaultCopy[Bucketizer](extra).setParent(parent) + } } private[feature] object Bucketizer { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala new file mode 100644 index 000000000000..49028e4b8506 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.util.collection.OpenHashMap + +/** + * Params for [[CountVectorizer]] and [[CountVectorizerModel]]. + */ +private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol { + + /** + * Max size of the vocabulary. + * CountVectorizer will build a vocabulary that only considers the top + * vocabSize terms ordered by term frequency across the corpus. + * + * Default: 2^18^ + * @group param + */ + val vocabSize: IntParam = + new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0)) + + /** @group getParam */ + def getVocabSize: Int = $(vocabSize) + + /** + * Specifies the minimum number of different documents a term must appear in to be included + * in the vocabulary. + * If this is an integer >= 1, this specifies the number of documents the term must appear in; + * if this is a double in [0,1), then this specifies the fraction of documents. 
+ * + * Default: 1 + * @group param + */ + val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" + + " different documents a term must appear in to be included in the vocabulary." + + " If this is an integer >= 1, this specifies the number of documents the term must" + + " appear in; if this is a double in [0,1), then this specifies the fraction of documents.", + ParamValidators.gtEq(0.0)) + + /** @group getParam */ + def getMinDF: Double = $(minDF) + + /** Validates and transforms the input schema. */ + protected def validateAndTransformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new ArrayType(StringType, true)) + SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) + } + + /** + * Filter to ignore rare words in a document. For each document, terms with + * frequency/count less than the given threshold are ignored. + * If this is an integer >= 1, then this specifies a count (of times the term must appear + * in the document); + * if this is a double in [0,1), then this specifies a fraction (out of the document's token + * count). + * + * Note that the parameter is only used in transform of [[CountVectorizerModel]] and does not + * affect fitting. + * + * Default: 1 + * @group param + */ + val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" + + " a document. For each document, terms with frequency/count less than the given threshold are" + + " ignored. If this is an integer >= 1, then this specifies a count (of times the term must" + + " appear in the document); if this is a double in [0,1), then this specifies a fraction (out" + + " of the document's token count). Note that the parameter is only used in transform of" + + " CountVectorizerModel and does not affect fitting.", ParamValidators.gtEq(0.0)) + + setDefault(minTF -> 1) + + /** @group getParam */ + def getMinTF: Double = $(minTF) +} + +/** + * :: Experimental :: + * Extracts a vocabulary from document collections and generates a [[CountVectorizerModel]]. 
+ */ +@Experimental +class CountVectorizer(override val uid: String) + extends Estimator[CountVectorizerModel] with CountVectorizerParams { + + def this() = this(Identifiable.randomUID("cntVec")) + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + /** @group setParam */ + def setVocabSize(value: Int): this.type = set(vocabSize, value) + + /** @group setParam */ + def setMinDF(value: Double): this.type = set(minDF, value) + + /** @group setParam */ + def setMinTF(value: Double): this.type = set(minTF, value) + + setDefault(vocabSize -> (1 << 18), minDF -> 1) + + override def fit(dataset: DataFrame): CountVectorizerModel = { + transformSchema(dataset.schema, logging = true) + val vocSize = $(vocabSize) + val input = dataset.select($(inputCol)).map(_.getAs[Seq[String]](0)) + val minDf = if ($(minDF) >= 1.0) { + $(minDF) + } else { + $(minDF) * input.cache().count() + } + val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) => + val wc = new OpenHashMap[String, Long] + tokens.foreach { w => + wc.changeValue(w, 1L, _ + 1L) + } + wc.map { case (word, count) => (word, (count, 1)) } + }.reduceByKey { case ((wc1, df1), (wc2, df2)) => + (wc1 + wc2, df1 + df2) + }.filter { case (word, (wc, df)) => + df >= minDf + }.map { case (word, (count, dfCount)) => + (word, count) + }.cache() + val fullVocabSize = wordCounts.count() + val vocab: Array[String] = { + val tmpSortedWC: Array[(String, Long)] = if (fullVocabSize <= vocSize) { + // Use all terms + wordCounts.collect().sortBy(-_._2) + } else { + // Sort terms to select vocab + wordCounts.sortBy(_._2, ascending = false).take(vocSize) + } + tmpSortedWC.map(_._1) + } + + require(vocab.length > 0, "The vocabulary size should be > 0. Lower minDF as necessary.") + copyValues(new CountVectorizerModel(uid, vocab).setParent(this)) + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra) +} + +/** + * :: Experimental :: + * Converts a text document to a sparse vector of token counts. + * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. 
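The CountVectorizer estimator above builds a vocabulary subject to vocabSize and minDF, and its model produces term-count vectors subject to minTF. A hedged usage sketch (assumes an existing SQLContext named sqlContext; column names and data are placeholders):

import org.apache.spark.ml.feature.CountVectorizer

val docs = sqlContext.createDataFrame(Seq(
  (0, Seq("a", "b", "c")),
  (1, Seq("a", "b", "b", "c", "a"))
)).toDF("id", "words")

val cv = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2)          // a term must appear in at least 2 documents
val cvModel = cv.fit(docs)
cvModel.transform(docs).select("features").show()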
+ */ +@Experimental +class CountVectorizerModel(override val uid: String, val vocabulary: Array[String]) + extends Model[CountVectorizerModel] with CountVectorizerParams { + + def this(vocabulary: Array[String]) = { + this(Identifiable.randomUID("cntVecModel"), vocabulary) + set(vocabSize, vocabulary.length) + } + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + /** @group setParam */ + def setMinTF(value: Double): this.type = set(minTF, value) + + /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */ + private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None + + override def transform(dataset: DataFrame): DataFrame = { + if (broadcastDict.isEmpty) { + val dict = vocabulary.zipWithIndex.toMap + broadcastDict = Some(dataset.sqlContext.sparkContext.broadcast(dict)) + } + val dictBr = broadcastDict.get + val minTf = $(minTF) + val vectorizer = udf { (document: Seq[String]) => + val termCounts = new OpenHashMap[Int, Double] + var tokenCount = 0L + document.foreach { term => + dictBr.value.get(term) match { + case Some(index) => termCounts.changeValue(index, 1.0, _ + 1.0) + case None => // ignore terms not in the vocabulary + } + tokenCount += 1 + } + val effectiveMinTF = if (minTf >= 1.0) { + minTf + } else { + tokenCount * minTf + } + Vectors.sparse(dictBr.value.size, termCounts.filter(_._2 >= effectiveMinTF).toSeq) + } + dataset.withColumn($(outputCol), vectorizer(col($(inputCol)))) + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + override def copy(extra: ParamMap): CountVectorizerModel = { + val copied = new CountVectorizerModel(uid, vocabulary).setParent(parent) + copyValues(copied, extra) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala deleted file mode 100644 index 6b77de89a033..000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.ml.feature - -import scala.collection.mutable - -import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{ParamMap, ParamValidators, IntParam} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.mllib.linalg.{Vectors, VectorUDT, Vector} -import org.apache.spark.sql.types.{StringType, ArrayType, DataType} - -/** - * :: Experimental :: - * Converts a text document to a sparse vector of token counts. - * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted. - */ -@Experimental -class CountVectorizerModel (override val uid: String, val vocabulary: Array[String]) - extends UnaryTransformer[Seq[String], Vector, CountVectorizerModel] { - - def this(vocabulary: Array[String]) = - this(Identifiable.randomUID("cntVec"), vocabulary) - - /** - * Corpus-specific filter to ignore scarce words in a document. For each document, terms with - * frequency (count) less than the given threshold are ignored. - * Default: 1 - * @group param - */ - val minTermFreq: IntParam = new IntParam(this, "minTermFreq", - "minimum frequency (count) filter used to neglect scarce words (>= 1). For each document, " + - "terms with frequency less than the given threshold are ignored.", ParamValidators.gtEq(1)) - - /** @group setParam */ - def setMinTermFreq(value: Int): this.type = set(minTermFreq, value) - - /** @group getParam */ - def getMinTermFreq: Int = $(minTermFreq) - - setDefault(minTermFreq -> 1) - - override protected def createTransformFunc: Seq[String] => Vector = { - val dict = vocabulary.zipWithIndex.toMap - document => - val termCounts = mutable.HashMap.empty[Int, Double] - document.foreach { term => - dict.get(term) match { - case Some(index) => termCounts.put(index, termCounts.getOrElse(index, 0.0) + 1.0) - case None => // ignore terms not in the vocabulary - } - } - Vectors.sparse(dict.size, termCounts.filter(_._2 >= $(minTermFreq)).toSeq) - } - - override protected def validateInputType(inputType: DataType): Unit = { - require(inputType.sameType(ArrayType(StringType)), - s"Input type must be ArrayType(StringType) but got $inputType.") - } - - override protected def outputDataType: DataType = new VectorUDT() - - override def copy(extra: ParamMap): CountVectorizerModel = { - val copied = new CountVectorizerModel(uid, vocabulary) - copyValues(copied, extra) - } -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala index ecde80810580..4c36df75d8aa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala @@ -35,6 +35,7 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol /** * The minimum of documents in which a term should appear. 
+ * Default: 0 * @group param */ final val minDocFreq = new IntParam( @@ -114,6 +115,6 @@ class IDFModel private[ml] ( override def copy(extra: ParamMap): IDFModel = { val copied = new IDFModel(uid, idfModel) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index b30adf3df48d..1b494ec8b172 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -41,6 +41,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H val min: DoubleParam = new DoubleParam(this, "min", "lower bound of the output feature range") + /** @group getParam */ + def getMin: Double = $(min) + /** * upper bound after transformation, shared by all features * Default: 1.0 @@ -49,6 +52,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H val max: DoubleParam = new DoubleParam(this, "max", "upper bound of the output feature range") + /** @group getParam */ + def getMax: Double = $(max) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType @@ -115,6 +121,9 @@ class MinMaxScaler(override val uid: String) * :: Experimental :: * Model fitted by [[MinMaxScaler]]. * + * @param originalMin min value for each original column during fitting + * @param originalMax max value for each original column during fitting + * * TODO: The transformer does not yet set the metadata in the output column (SPARK-8529). */ @Experimental @@ -136,7 +145,6 @@ class MinMaxScalerModel private[ml] ( /** @group setParam */ def setMax(value: Double): this.type = set(max, value) - override def transform(dataset: DataFrame): DataFrame = { val originalRange = (originalMax.toBreeze - originalMin.toBreeze).toArray val minArray = originalMin.toArray @@ -165,6 +173,6 @@ class MinMaxScalerModel private[ml] ( override def copy(extra: ParamMap): MinMaxScalerModel = { val copied = new MinMaxScalerModel(uid, originalMin, originalMax) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 2d3bb680cf30..539084704b65 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -125,6 +125,6 @@ class PCAModel private[ml] ( override def copy(extra: ParamMap): PCAModel = { val copied = new PCAModel(uid, pcaModel) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index d1726917e451..a7fa50444209 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -19,27 +19,20 @@ package org.apache.spark.ml.feature import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import scala.util.parsing.combinator.RegexParsers import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.{Estimator, Model, Transformer, Pipeline, PipelineModel, PipelineStage} +import org.apache.spark.ml.{Estimator, Model, Pipeline, 
PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol} import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.VectorUDT import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** * Base trait for [[RFormula]] and [[RFormulaModel]]. */ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { - /** @group getParam */ - def setFeaturesCol(value: String): this.type = set(featuresCol, value) - - /** @group getParam */ - def setLabelCol(value: String): this.type = set(labelCol, value) protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) @@ -49,8 +42,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** * :: Experimental :: * Implements the transforms required for fitting a dataset against an R model formula. Currently - * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula - * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html + * we support a limited subset of the R operators, including '.', '~', '+', and '-'. Also see the + * R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html */ @Experimental class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase { @@ -63,31 +56,32 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R */ val formula: Param[String] = new Param(this, "formula", "R model formula") - private var parsedFormula: Option[ParsedRFormula] = None - /** * Sets the formula to use for this transformer. Must be called before use. * @group setParam * @param value an R formula in string form (e.g. "y ~ x + z") */ - def setFormula(value: String): this.type = { - parsedFormula = Some(RFormulaParser.parse(value)) - set(formula, value) - this - } + def setFormula(value: String): this.type = set(formula, value) /** @group getParam */ def getFormula: String = $(formula) + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + def setLabelCol(value: String): this.type = set(labelCol, value) + /** Whether the formula specifies fitting an intercept. */ private[ml] def hasIntercept: Boolean = { - require(parsedFormula.isDefined, "Must call setFormula() first.") - parsedFormula.get.hasIntercept + require(isDefined(formula), "Formula must be defined first.") + RFormulaParser.parse($(formula)).hasIntercept } override def fit(dataset: DataFrame): RFormulaModel = { - require(parsedFormula.isDefined, "Must call setFormula() first.") - val resolvedFormula = parsedFormula.get.resolve(dataset.schema) + require(isDefined(formula), "Formula must be defined first.") + val parsedFormula = RFormulaParser.parse($(formula)) + val resolvedFormula = parsedFormula.resolve(dataset.schema) // StringType terms and terms representing interactions need to be encoded before assembly. 
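// Illustrative sketch, not part of the patch: with the change above, setFormula only stores the
// string Param and parsing happens inside fit(). Assumes an existing SQLContext `sqlContext`; the
// data and column names are hypothetical.
import org.apache.spark.ml.feature.RFormula
val df = sqlContext.createDataFrame(
  Seq((1.0, 2.0, 3.0), (0.0, 4.0, 5.0))
).toDF("y", "x", "z")
val formula = new RFormula()
  .setFormula("y ~ x + z")
  .setFeaturesCol("features")
  .setLabelCol("label")
val rModel = formula.fit(df)   // the formula string is parsed and resolved against df.schema here
rModel.transform(df).select("features", "label").show()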
// TODO(ekl) add support for feature interactions val encoderStages = ArrayBuffer[PipelineStage]() diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 72b545e5db3e..f6d0b0c0e9e7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -136,6 +136,6 @@ class StandardScalerModel private[ml] ( override def copy(extra: ParamMap): StandardScalerModel = { val copied = new StandardScalerModel(uid, scaler) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 3cc41424460f..3a0b2faa2f82 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,12 +19,12 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.param.{ParamMap, BooleanParam, Param} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.types.{StringType, StructField, ArrayType, StructType} import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} /** * stop words list @@ -98,9 +98,10 @@ class StopWordsRemover(override val uid: String) /** * the stop words set to be filtered out + * Default: [[StopWords.English]] * @group param */ - val stopWords: Param[Array[String]] = new Param(this, "stopWords", "stop words") + val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ def setStopWords(value: Array[String]): this.type = set(stopWords, value) @@ -110,6 +111,7 @@ class StopWordsRemover(override val uid: String) /** * whether to do a case sensitive comparison over the stop words + * Default: false * @group param */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index ebfa97253235..048b5e8b5105 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -24,10 +24,10 @@ import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.Transformer -import org.apache.spark.ml.util.{Identifiable, MetadataUtils} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DoubleType, NumericType, StringType, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.util.collection.OpenHashMap /** @@ -58,6 +58,8 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha * If the input column is numeric, we cast it to string and index the string values. 
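// Illustrative sketch, not part of the patch: the stopWords Param above now defaults to
// StopWords.English and caseSensitive defaults to false; both can still be overridden as below.
// Assumes an existing SQLContext `sqlContext`; the data and column names are hypothetical.
import org.apache.spark.ml.feature.StopWordsRemover
val remover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
  .setStopWords(Array("the", "a", "an"))   // override the English default
  .setCaseSensitive(false)                 // "The" is removed as well
val docs = sqlContext.createDataFrame(Seq(
  (0, Seq("The", "quick", "fox")),
  (1, Seq("jumps", "over", "a", "lazy", "dog"))
)).toDF("id", "raw")
remover.transform(docs).show()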
* The indices are in [0, numLabels), ordered by label frequencies. * So the most frequent label gets index 0. + * + * @see [[IndexToString]] for the inverse transformation */ @Experimental class StringIndexer(override val uid: String) extends Estimator[StringIndexerModel] @@ -91,14 +93,19 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod /** * :: Experimental :: * Model fitted by [[StringIndexer]]. + * * NOTE: During transformation, if the input column does not exist, * [[StringIndexerModel.transform]] would return the input dataset unmodified. * This is a temporary fix for the case when target labels do not exist during prediction. + * + * @param labels Ordered list of labels, corresponding to indices to be assigned */ @Experimental -class StringIndexerModel private[ml] ( +class StringIndexerModel ( override val uid: String, - labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase { + val labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase { + + def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) private val labelToIndex: OpenHashMap[String, Double] = { val n = labels.length @@ -150,36 +157,26 @@ class StringIndexerModel private[ml] ( override def copy(extra: ParamMap): StringIndexerModel = { val copied = new StringIndexerModel(uid, labels) - copyValues(copied, extra) - } - - /** - * Return a model to perform the inverse transformation. - * Note: By default we keep the original columns during this transformation, so the inverse - * should only be used on new columns such as predicted labels. - */ - def invert(inputCol: String, outputCol: String): StringIndexerInverse = { - new StringIndexerInverse() - .setInputCol(inputCol) - .setOutputCol(outputCol) - .setLabels(labels) + copyValues(copied, extra).setParent(parent) } } /** * :: Experimental :: - * Transform a provided column back to the original input types using either the metadata - * on the input column, or if provided using the labels supplied by the user. - * Note: By default we keep the original columns during this transformation, - * so the inverse should only be used on new columns such as predicted labels. + * A [[Transformer]] that maps a column of string indices back to a new column of corresponding + * string values using either the ML attributes of the input column, or if provided using the labels + * supplied by the user. + * All original columns are kept during transformation. + * + * @see [[StringIndexer]] for converting strings into indices */ @Experimental -class StringIndexerInverse private[ml] ( +class IndexToString private[ml] ( override val uid: String) extends Transformer with HasInputCol with HasOutputCol { def this() = - this(Identifiable.randomUID("strIdxInv")) + this(Identifiable.randomUID("idxToStr")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -197,8 +194,8 @@ class StringIndexerInverse private[ml] ( /** * Param for array of labels. - * Optional labels to be provided by the user, if not supplied column - * metadata is read for labels. + * Optional labels to be provided by the user. + * Default: Empty array, in which case column metadata is used for labels. 
* @group param */ final val labels: StringArrayParam = new StringArrayParam(this, "labels", @@ -223,8 +220,7 @@ class StringIndexerInverse private[ml] ( val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") - val attr = NominalAttribute.defaultAttr.withName($(outputCol)) - val outputFields = inputFields :+ attr.toStructField() + val outputFields = inputFields :+ StructField($(outputCol), StringType) StructType(outputFields) } @@ -239,7 +235,7 @@ class StringIndexerInverse private[ml] ( } val indexer = udf { index: Double => val idx = index.toInt - if (0 <= idx && idx < values.size) { + if (0 <= idx && idx < values.length) { values(idx) } else { throw new SparkException(s"Unseen index: $index ??") @@ -250,7 +246,7 @@ class StringIndexerInverse private[ml] ( indexer(dataset($(inputCol)).cast(DoubleType)).as(outputColName)) } - override def copy(extra: ParamMap): StringIndexerInverse = { + override def copy(extra: ParamMap): IndexToString = { defaultCopy(extra) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index c73bdccdef5f..fa9f8237e39b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -43,6 +43,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu * Must be >= 2. * * (default = 20) + * @group param */ val maxCategories = new IntParam(this, "maxCategories", "Threshold for the number of values a categorical feature can take (>= 2)." + @@ -405,6 +406,6 @@ class VectorIndexerModel private[ml] ( override def copy(extra: ParamMap): VectorIndexerModel = { val copied = new VectorIndexerModel(uid, numFeatures, categoryMaps) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala new file mode 100644 index 000000000000..9ec6b3e1fa8e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
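// Illustrative sketch, not part of the patch: round-tripping labels with StringIndexer and the new
// IndexToString transformer above (the replacement for StringIndexerModel.invert). Assumes an
// existing SQLContext `sqlContext`; the data and column names are hypothetical.
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
val df = sqlContext.createDataFrame(
  Seq((0, "a"), (1, "b"), (2, "a"), (3, "c"))
).toDF("id", "category")
val indexerModel = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .fit(df)
val indexed = indexerModel.transform(df)
val converter = new IndexToString()
  .setInputCol("categoryIndex")
  .setOutputCol("originalCategory")
  .setLabels(indexerModel.labels)   // optional: ML attributes on categoryIndex would also be used
converter.transform(indexed).select("id", "categoryIndex", "originalCategory").show()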
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.{IntArrayParam, ParamMap, StringArrayParam} +import org.apache.spark.ml.util.{Identifiable, MetadataUtils, SchemaUtils} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.StructType + +/** + * :: Experimental :: + * This class takes a feature vector and outputs a new feature vector with a subarray of the + * original features. + * + * The subset of features can be specified with either indices ([[setIndices()]]) + * or names ([[setNames()]]). At least one feature must be selected. Duplicate features + * are not allowed, so there can be no overlap between selected indices and names. + * + * The output vector will order features with the selected indices first (in the order given), + * followed by the selected names (in the order given). + */ +@Experimental +final class VectorSlicer(override val uid: String) + extends Transformer with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("vectorSlicer")) + + /** + * An array of indices to select features from a vector column. + * There can be no overlap with [[names]]. + * Default: Empty array + * @group param + */ + val indices = new IntArrayParam(this, "indices", + "An array of indices to select features from a vector column." + + " There can be no overlap with names.", VectorSlicer.validIndices) + + setDefault(indices -> Array.empty[Int]) + + /** @group getParam */ + def getIndices: Array[Int] = $(indices) + + /** @group setParam */ + def setIndices(value: Array[Int]): this.type = set(indices, value) + + /** + * An array of feature names to select features from a vector column. + * These names must be specified by ML [[org.apache.spark.ml.attribute.Attribute]]s. + * There can be no overlap with [[indices]]. + * Default: Empty Array + * @group param + */ + val names = new StringArrayParam(this, "names", + "An array of feature names to select features from a vector column." 
+ + " There can be no overlap with indices.", VectorSlicer.validNames) + + setDefault(names -> Array.empty[String]) + + /** @group getParam */ + def getNames: Array[String] = $(names) + + /** @group setParam */ + def setNames(value: Array[String]): this.type = set(names, value) + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + override def validateParams(): Unit = { + require($(indices).length > 0 || $(names).length > 0, + s"VectorSlicer requires that at least one feature be selected.") + } + + override def transform(dataset: DataFrame): DataFrame = { + // Validity checks + transformSchema(dataset.schema) + val inputAttr = AttributeGroup.fromStructField(dataset.schema($(inputCol))) + inputAttr.numAttributes.foreach { numFeatures => + val maxIndex = $(indices).max + require(maxIndex < numFeatures, + s"Selected feature index $maxIndex invalid for only $numFeatures input features.") + } + + // Prepare output attributes + val inds = getSelectedFeatureIndices(dataset.schema) + val selectedAttrs: Option[Array[Attribute]] = inputAttr.attributes.map { attrs => + inds.map(index => attrs(index)) + } + val outputAttr = selectedAttrs match { + case Some(attrs) => new AttributeGroup($(outputCol), attrs) + case None => new AttributeGroup($(outputCol), inds.length) + } + + // Select features + val slicer = udf { vec: Vector => + vec match { + case features: DenseVector => Vectors.dense(inds.map(features.apply)) + case features: SparseVector => features.slice(inds) + } + } + dataset.withColumn($(outputCol), + slicer(dataset($(inputCol))).as($(outputCol), outputAttr.toMetadata())) + } + + /** Get the feature indices in order: indices, names */ + private def getSelectedFeatureIndices(schema: StructType): Array[Int] = { + val nameFeatures = MetadataUtils.getFeatureIndicesFromNames(schema($(inputCol)), $(names)) + val indFeatures = $(indices) + val numDistinctFeatures = (nameFeatures ++ indFeatures).distinct.length + lazy val errMsg = "VectorSlicer requires indices and names to be disjoint" + + s" sets of features, but they overlap." + + s" indices: ${indFeatures.mkString("[", ",", "]")}." 
+ + s" names: " + + nameFeatures.zip($(names)).map { case (i, n) => s"$i:$n" }.mkString("[", ",", "]") + require(nameFeatures.length + indFeatures.length == numDistinctFeatures, errMsg) + indFeatures ++ nameFeatures + } + + override def transformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) + + if (schema.fieldNames.contains($(outputCol))) { + throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") + } + val numFeaturesSelected = $(indices).length + $(names).length + val outputAttr = new AttributeGroup($(outputCol), numFeaturesSelected) + val outputFields = schema.fields :+ outputAttr.toStructField() + StructType(outputFields) + } + + override def copy(extra: ParamMap): VectorSlicer = defaultCopy(extra) +} + +private[feature] object VectorSlicer { + + /** Return true if given feature indices are valid */ + def validIndices(indices: Array[Int]): Boolean = { + if (indices.isEmpty) { + true + } else { + indices.length == indices.distinct.length && indices.forall(_ >= 0) + } + } + + /** Return true if given feature names are valid */ + def validNames(names: Array[String]): Boolean = { + names.forall(_.nonEmpty) && names.length == names.distinct.length + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 6ea659095630..9edab3af913c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -18,15 +18,17 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental +import org.apache.spark.SparkContext import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.mllib.linalg.{VectorUDT, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ /** @@ -37,6 +39,7 @@ private[feature] trait Word2VecBase extends Params /** * The dimension of the code that you want to transform from words. + * Default: 100 * @group param */ final val vectorSize = new IntParam( @@ -48,6 +51,7 @@ private[feature] trait Word2VecBase extends Params /** * Number of partitions for sentences of words. + * Default: 1 * @group param */ final val numPartitions = new IntParam( @@ -60,6 +64,7 @@ private[feature] trait Word2VecBase extends Params /** * The minimum number of times a token must appear to be included in the word2vec model's * vocabulary. + * Default: 5 * @group param */ final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " + @@ -146,6 +151,40 @@ class Word2VecModel private[ml] ( wordVectors: feature.Word2VecModel) extends Model[Word2VecModel] with Word2VecBase { + + /** + * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and + * and the vector the DenseVector that it is mapped to. 
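// Illustrative sketch, not part of the patch: using the new VectorSlicer above to select features
// by index and by ML attribute name; the two sets must be disjoint, and indices come first in the
// output. Assumes an existing SQLContext `sqlContext`; attribute names and values are hypothetical.
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

val attrs = Array("f1", "f2", "f3")
  .map(NumericAttribute.defaultAttr.withName)
  .asInstanceOf[Array[Attribute]]
val group = new AttributeGroup("userFeatures", attrs)
val rows = sqlContext.sparkContext.parallelize(Seq(Row(Vectors.dense(-2.0, 2.3, 0.0))))
val dataset = sqlContext.createDataFrame(rows, StructType(Array(group.toStructField())))
val slicer = new VectorSlicer()
  .setInputCol("userFeatures")
  .setOutputCol("slicedFeatures")
  .setIndices(Array(0))      // take feature 0 by position ...
  .setNames(Array("f3"))     // ... and feature "f3" by attribute name
slicer.transform(dataset).show()   // slicedFeatures = [-2.0, 0.0]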
+ */ + @transient lazy val getVectors: DataFrame = { + val sc = SparkContext.getOrCreate() + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + val wordVec = wordVectors.getVectors.mapValues(vec => Vectors.dense(vec.map(_.toDouble))) + sc.parallelize(wordVec.toSeq).toDF("word", "vector") + } + + /** + * Find "num" number of words closest in similarity to the given word. + * Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word. + */ + def findSynonyms(word: String, num: Int): DataFrame = { + findSynonyms(wordVectors.transform(word), num) + } + + /** + * Find "num" number of words closest to similarity to the given vector representation + * of the word. Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word vector. + */ + def findSynonyms(word: Vector, num: Int): DataFrame = { + val sc = SparkContext.getOrCreate() + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + sc.parallelize(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + } + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -185,6 +224,6 @@ class Word2VecModel private[ml] ( override def copy(extra: ParamMap): Word2VecModel = { val copied = new Word2VecModel(uid, wordVectors) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala new file mode 100644 index 000000000000..4571ab26800c --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml + +import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel, VectorAssembler} +import org.apache.spark.sql.DataFrame + +/** + * == Feature transformers == + * + * The `ml.feature` package provides common feature transformers that help convert raw data or + * features into more suitable forms for model fitting. + * Most feature transformers are implemented as [[Transformer]]s, which transform one [[DataFrame]] + * into another, e.g., [[HashingTF]]. + * Some feature transformers are implemented as [[Estimator]]s, because the transformation requires + * some aggregated information of the dataset, e.g., document frequencies in [[IDF]]. + * For those feature transformers, calling [[Estimator!.fit]] is required to obtain the model first, + * e.g., [[IDFModel]], in order to apply transformation. + * The transformation is usually done by appending new columns to the input [[DataFrame]], so all + * input columns are carried over. 
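// Illustrative sketch, not part of the patch: exercising the getVectors and findSynonyms helpers
// added to Word2VecModel above. Assumes an existing SQLContext `sqlContext`; the toy corpus is
// hypothetical.
import org.apache.spark.ml.feature.Word2Vec
val documents = sqlContext.createDataFrame(Seq(
  "Hi I heard about Spark".split(" "),
  "I wish Java could use case classes".split(" "),
  "Logistic regression models are neat".split(" ")
).map(Tuple1.apply)).toDF("text")
val w2vModel = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(3)
  .setMinCount(0)
  .fit(documents)
w2vModel.getVectors.show()                 // DataFrame of (word, vector)
w2vModel.findSynonyms("Spark", 2).show()   // DataFrame of (word, similarity)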
+ * + * We try to make each transformer minimal, so it becomes flexible to assemble feature + * transformation pipelines. + * [[Pipeline]] can be used to chain feature transformers, and [[VectorAssembler]] can be used to + * combine multiple feature transformations, for example: + * + * {{{ + * import org.apache.spark.ml.feature._ + * import org.apache.spark.ml.Pipeline + * + * // a DataFrame with three columns: id (integer), text (string), and rating (double). + * val df = sqlContext.createDataFrame(Seq( + * (0, "Hi I heard about Spark", 3.0), + * (1, "I wish Java could use case classes", 4.0), + * (2, "Logistic regression models are neat", 4.0) + * )).toDF("id", "text", "rating") + * + * // define feature transformers + * val tok = new RegexTokenizer() + * .setInputCol("text") + * .setOutputCol("words") + * val sw = new StopWordsRemover() + * .setInputCol("words") + * .setOutputCol("filtered_words") + * val tf = new HashingTF() + * .setInputCol("filtered_words") + * .setOutputCol("tf") + * .setNumFeatures(10000) + * val idf = new IDF() + * .setInputCol("tf") + * .setOutputCol("tf_idf") + * val assembler = new VectorAssembler() + * .setInputCols(Array("tf_idf", "rating")) + * .setOutputCol("features") + * + * // assemble and fit the feature transformation pipeline + * val pipeline = new Pipeline() + * .setStages(Array(tok, sw, tf, idf, assembler)) + * val model = pipeline.fit(df) + * + * // save transformed features with raw data + * model.transform(df) + * .select("id", "text", "rating", "features") + * .write.format("parquet").save("/output/path") + * }}} + * + * Some feature transformers implemented in MLlib are inspired by those implemented in scikit-learn. + * The major difference is that most scikit-learn feature transformers operate eagerly on the entire + * input dataset, while MLlib's feature transformers operate lazily on individual columns, + * which is more efficient and flexible to handle large and complex datasets. + * + * @see [[http://scikit-learn.org/stable/modules/preprocessing.html scikit-learn.preprocessing]] + */ +package object feature diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index d68f5ff0053c..91c0a5631319 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -559,13 +559,26 @@ trait Params extends Identifiable with Serializable { /** * Copies param values from this instance to another instance for params shared by them. - * @param to the target instance - * @param extra extra params to be copied + * + * This handles default Params and explicitly set Params separately. + * Default Params are copied from and to [[defaultParamMap]], and explicitly set Params are + * copied from and to [[paramMap]]. + * Warning: This implicitly assumes that this [[Params]] instance and the target instance + * share the same set of default Params. 
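// Illustrative sketch, not part of the patch: an observable effect of the copyValues change above.
// A value that was only a default on the source now stays a default on the copy instead of
// becoming an explicitly set value. Binarizer is used here purely as a convenient Params instance.
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.ml.param.ParamMap
val binarizer = new Binarizer().setInputCol("feature").setOutputCol("binarized")
val copied = binarizer.copy(ParamMap.empty)
println(copied.isSet(copied.threshold))   // false: the default was not turned into a set value
println(copied.getThreshold)              // 0.0: the default itself still carries over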
+ * + * @param to the target instance, which should work with the same set of default Params as this + * source instance + * @param extra extra params to be copied to the target's [[paramMap]] * @return the target instance with param values copied */ protected def copyValues[T <: Params](to: T, extra: ParamMap = ParamMap.empty): T = { - val map = extractParamMap(extra) + val map = paramMap ++ extra params.foreach { param => + // copy default Params + if (defaultParamMap.contains(param) && to.hasParam(param.name)) { + to.defaultParamMap.put(to.getParam(param.name), defaultParamMap(param)) + } + // copy explicitly set Params if (map.contains(param) && to.hasParam(param.name)) { to.set(param.name, map(param)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index f7ae1de522e0..3899df6cbcf5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -45,8 +45,14 @@ private[shared] object SharedParamsCodeGen { " These probabilities should be treated as confidences, not precise probabilities.", Some("\"probability\"")), ParamDesc[Double]("threshold", - "threshold in binary classification prediction, in range [0, 1]", - isValid = "ParamValidators.inRange(0, 1)"), + "threshold in binary classification prediction, in range [0, 1]", Some("0.5"), + isValid = "ParamValidators.inRange(0, 1)", finalMethods = false), + ParamDesc[Array[Double]]("thresholds", "Thresholds in multi-class classification" + + " to adjust the probability of predicting each class." + + " Array must have length equal to the number of classes, with values >= 0." + + " The class with largest value p/t is predicted, where p is the original probability" + + " of that class and t is the class' threshold.", + isValid = "(t: Array[Double]) => t.forall(_ >= 0)", finalMethods = false), ParamDesc[String]("inputCol", "input column name"), ParamDesc[Array[String]]("inputCols", "input column names"), ParamDesc[String]("outputCol", "output column name", Some("uid + \"__output\"")), @@ -60,7 +66,9 @@ private[shared] object SharedParamsCodeGen { " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", isValid = "ParamValidators.inRange(0, 1)"), ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"), - ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization.")) + ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."), + ParamDesc[String]("weightCol", "weight column name. 
If this is not set or empty, we treat " + + "all instance weights as 1.0.")) val code = genSharedParams(params) val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala" @@ -74,7 +82,8 @@ private[shared] object SharedParamsCodeGen { name: String, doc: String, defaultValueStr: Option[String] = None, - isValid: String = "") { + isValid: String = "", + finalMethods: Boolean = true) { require(name.matches("[a-z][a-zA-Z0-9]*"), s"Param name $name is invalid.") require(doc.nonEmpty) // TODO: more rigorous on doc @@ -88,6 +97,7 @@ private[shared] object SharedParamsCodeGen { case _ if c == classOf[Double] => "DoubleParam" case _ if c == classOf[Boolean] => "BooleanParam" case _ if c.isArray && c.getComponentType == classOf[String] => s"StringArrayParam" + case _ if c.isArray && c.getComponentType == classOf[Double] => s"DoubleArrayParam" case _ => s"Param[${getTypeString(c)}]" } } @@ -131,6 +141,11 @@ private[shared] object SharedParamsCodeGen { } else { "" } + val methodStr = if (param.finalMethods) { + "final def" + } else { + "def" + } s""" |/** @@ -145,7 +160,7 @@ private[shared] object SharedParamsCodeGen { | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) |$setDefault | /** @group getParam */ - | final def get$Name: $T = $$($name) + | $methodStr get$Name: $T = $$($name) |} |""".stripMargin } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 65e48e4ee508..e8e58aadeabb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -139,7 +139,7 @@ private[ml] trait HasProbabilityCol extends Params { } /** - * Trait for shared param threshold. + * Trait for shared param threshold (default: 0.5). */ private[ml] trait HasThreshold extends Params { @@ -149,8 +149,25 @@ private[ml] trait HasThreshold extends Params { */ final val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) + setDefault(threshold, 0.5) + + /** @group getParam */ + def getThreshold: Double = $(threshold) +} + +/** + * Trait for shared param thresholds. + */ +private[ml] trait HasThresholds extends Params { + + /** + * Param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.. + * @group param + */ + final val thresholds: DoubleArrayParam = new DoubleArrayParam(this, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", (t: Array[Double]) => t.forall(_ >= 0)) + /** @group getParam */ - final def getThreshold: Double = $(threshold) + def getThresholds: Array[Double] = $(thresholds) } /** @@ -310,4 +327,19 @@ private[ml] trait HasStepSize extends Params { /** @group getParam */ final def getStepSize: Double = $(stepSize) } + +/** + * Trait for shared param weightCol. + */ +private[ml] trait HasWeightCol extends Params { + + /** + * Param for weight column name. 
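// Illustrative sketch, not part of the patch: the p/t rule described by the new thresholds Param
// above, written out for a single probability vector. The numbers are hypothetical.
def predictWithThresholds(probabilities: Array[Double], thresholds: Array[Double]): Int = {
  require(probabilities.length == thresholds.length, "one threshold per class")
  val scaled = probabilities.zip(thresholds).map { case (p, t) => p / t }
  scaled.indexOf(scaled.max)   // the class with the largest p/t wins
}
// class 1 has the highest raw probability, but its large threshold hands the prediction to class 2
println(predictWithThresholds(Array(0.2, 0.5, 0.3), Array(1.0, 10.0, 1.0)))   // prints 2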
If this is not set or empty, we treat all instance weights as 1.0.. + * @group param + */ + final val weightCol: Param[String] = new Param[String](this, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.") + + /** @group getParam */ + final def getWeightCol: String = $(weightCol) +} // scalastyle:on diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 2e44cd4cc6a2..7db8ad8d2791 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -219,7 +219,7 @@ class ALSModel private[ml] ( override def copy(extra: ParamMap): ALSModel = { val copied = new ALSModel(uid, rank, userFactors, itemFactors) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 4d30e4b5548a..a2bcd67401d0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -107,14 +107,14 @@ final class DecisionTreeRegressionModel private[ml] ( * Construct a decision tree regression model. * @param rootNode Root node of tree, with other nodes attached. */ - def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) + private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) override protected def predict(features: Vector): Double = { rootNode.predictImpl(features).prediction } override def copy(extra: ParamMap): DecisionTreeRegressionModel = { - copyValues(new DecisionTreeRegressionModel(uid, rootNode), extra) + copyValues(new DecisionTreeRegressionModel(uid, rootNode), extra).setParent(parent) } override def toString: String = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 5633bc320273..b66e61f37dd5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -185,7 +185,7 @@ final class GBTRegressionModel( } override def copy(extra: ParamMap): GBTRegressionModel = { - copyValues(new GBTRegressionModel(uid, _trees, _treeWeights), extra) + copyValues(new GBTRegressionModel(uid, _trees, _treeWeights), extra).setParent(parent) } override def toString: String = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index 4ece8cf8cf0b..2ff500f291ab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -17,44 +17,102 @@ package org.apache.spark.ml.regression +import org.apache.spark.Logging import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.PredictorParams -import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam} -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} -import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression} -import org.apache.spark.mllib.regression.{IsotonicRegressionModel => 
MLlibIsotonicRegressionModel} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} +import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression, IsotonicRegressionModel => MLlibIsotonicRegressionModel} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{DoubleType, DataType} -import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.storage.StorageLevel /** * Params for isotonic regression. */ -private[regression] trait IsotonicRegressionParams extends PredictorParams { +private[regression] trait IsotonicRegressionBase extends Params with HasFeaturesCol + with HasLabelCol with HasPredictionCol with HasWeightCol with Logging { /** - * Param for weight column name. - * TODO: Move weightCol to sharedParams. - * + * Param for whether the output sequence should be isotonic/increasing (true) or + * antitonic/decreasing (false). + * Default: true * @group param */ - final val weightCol: Param[String] = - new Param[String](this, "weightCol", "weight column name") + final val isotonic: BooleanParam = + new BooleanParam(this, "isotonic", + "whether the output sequence should be isotonic/increasing (true) or" + + "antitonic/decreasing (false)") /** @group getParam */ - final def getWeightCol: String = $(weightCol) + final def getIsotonic: Boolean = $(isotonic) /** - * Param for isotonic parameter. - * Isotonic (increasing) or antitonic (decreasing) sequence. + * Param for the index of the feature if [[featuresCol]] is a vector column (default: `0`), no + * effect otherwise. * @group param */ - final val isotonic: BooleanParam = - new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic (decreasing) sequence") + final val featureIndex: IntParam = new IntParam(this, "featureIndex", + "The index of the feature if featuresCol is a vector column, no effect otherwise.") /** @group getParam */ - final def getIsotonicParam: Boolean = $(isotonic) + final def getFeatureIndex: Int = $(featureIndex) + + setDefault(isotonic -> true, featureIndex -> 0) + + /** Checks whether the input has weight column. */ + protected[ml] def hasWeightCol: Boolean = { + isDefined(weightCol) && $(weightCol) != "" + } + + /** + * Extracts (label, feature, weight) from input dataset. + */ + protected[ml] def extractWeightedLabeledPoints( + dataset: DataFrame): RDD[(Double, Double, Double)] = { + val f = if (dataset.schema($(featuresCol)).dataType.isInstanceOf[VectorUDT]) { + val idx = $(featureIndex) + val extract = udf { v: Vector => v(idx) } + extract(col($(featuresCol))) + } else { + col($(featuresCol)) + } + val w = if (hasWeightCol) { + col($(weightCol)) + } else { + lit(1.0) + } + dataset.select(col($(labelCol)), f, w) + .map { case Row(label: Double, feature: Double, weights: Double) => + (label, feature, weights) + } + } + + /** + * Validates and transforms input schema. 
+ * @param schema input schema + * @param fitting whether this is in fitting or prediction + * @return output schema + */ + protected[ml] def validateAndTransformSchema( + schema: StructType, + fitting: Boolean): StructType = { + if (fitting) { + SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) + if (hasWeightCol) { + SchemaUtils.checkColumnType(schema, $(weightCol), DoubleType) + } else { + logInfo("The weight column is not defined. Treat all instance weights as 1.0.") + } + } + val featuresType = schema($(featuresCol)).dataType + require(featuresType == DoubleType || featuresType.isInstanceOf[VectorUDT]) + SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType) + } } /** @@ -67,52 +125,46 @@ private[regression] trait IsotonicRegressionParams extends PredictorParams { * Uses [[org.apache.spark.mllib.regression.IsotonicRegression]]. */ @Experimental -class IsotonicRegression(override val uid: String) - extends Regressor[Double, IsotonicRegression, IsotonicRegressionModel] - with IsotonicRegressionParams { +class IsotonicRegression(override val uid: String) extends Estimator[IsotonicRegressionModel] + with IsotonicRegressionBase { def this() = this(Identifiable.randomUID("isoReg")) - /** - * Set the isotonic parameter. - * Default is true. - * @group setParam - */ - def setIsotonicParam(value: Boolean): this.type = set(isotonic, value) - setDefault(isotonic -> true) + /** @group setParam */ + def setLabelCol(value: String): this.type = set(labelCol, value) - /** - * Set weight column param. - * Default is weight. - * @group setParam - */ - def setWeightParam(value: String): this.type = set(weightCol, value) - setDefault(weightCol -> "weight") + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) - override private[ml] def featuresDataType: DataType = DoubleType + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) - override def copy(extra: ParamMap): IsotonicRegression = defaultCopy(extra) + /** @group setParam */ + def setIsotonic(value: Boolean): this.type = set(isotonic, value) - private[this] def extractWeightedLabeledPoints( - dataset: DataFrame): RDD[(Double, Double, Double)] = { + /** @group setParam */ + def setWeightCol(value: String): this.type = set(weightCol, value) - dataset.select($(labelCol), $(featuresCol), $(weightCol)) - .map { case Row(label: Double, features: Double, weights: Double) => - (label, features, weights) - } - } + /** @group setParam */ + def setFeatureIndex(value: Int): this.type = set(featureIndex, value) + + override def copy(extra: ParamMap): IsotonicRegression = defaultCopy(extra) - override protected def train(dataset: DataFrame): IsotonicRegressionModel = { - SchemaUtils.checkColumnType(dataset.schema, $(weightCol), DoubleType) + override def fit(dataset: DataFrame): IsotonicRegressionModel = { + validateAndTransformSchema(dataset.schema, fitting = true) // Extract columns from data. If dataset is persisted, do not persist oldDataset. 
val instances = extractWeightedLabeledPoints(dataset) val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) val isotonicRegression = new MLlibIsotonicRegression().setIsotonic($(isotonic)) - val parentModel = isotonicRegression.run(instances) + val oldModel = isotonicRegression.run(instances) + + copyValues(new IsotonicRegressionModel(uid, oldModel).setParent(this)) + } - new IsotonicRegressionModel(uid, parentModel) + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema, fitting = true) } } @@ -123,22 +175,49 @@ class IsotonicRegression(override val uid: String) * * For detailed rules see [[org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()]]. * - * @param parentModel A [[org.apache.spark.mllib.regression.IsotonicRegressionModel]] - * model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]]. + * @param oldModel A [[org.apache.spark.mllib.regression.IsotonicRegressionModel]] + * model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]]. */ +@Experimental class IsotonicRegressionModel private[ml] ( override val uid: String, - private[ml] val parentModel: MLlibIsotonicRegressionModel) - extends RegressionModel[Double, IsotonicRegressionModel] - with IsotonicRegressionParams { + private val oldModel: MLlibIsotonicRegressionModel) + extends Model[IsotonicRegressionModel] with IsotonicRegressionBase { - override def featuresDataType: DataType = DoubleType + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) - override protected def predict(features: Double): Double = { - parentModel.predict(features) - } + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setFeatureIndex(value: Int): this.type = set(featureIndex, value) + + /** Boundaries in increasing order for which predictions are known. */ + def boundaries: Vector = Vectors.dense(oldModel.boundaries) + + /** + * Predictions associated with the boundaries at the same index, monotone because of isotonic + * regression. 
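// Illustrative sketch, not part of the patch: fitting the reworked ml.regression.IsotonicRegression
// above on a DoubleType feature column with instance weights. Assumes an existing SQLContext
// `sqlContext`; the data is hypothetical.
import org.apache.spark.ml.regression.IsotonicRegression
val points = sqlContext.createDataFrame(Seq(
  (1.0, 1.0, 1.0), (3.0, 2.0, 1.0), (2.0, 3.0, 1.0), (5.0, 4.0, 1.0)
)).toDF("label", "features", "weight")
val ir = new IsotonicRegression()
  .setIsotonic(true)        // the default
  .setWeightCol("weight")   // omit to treat every instance weight as 1.0
val irModel = ir.fit(points)
println(irModel.boundaries)    // increasing feature values where the fitted function changes
println(irModel.predictions)   // monotone predictions attached to those boundaries
irModel.transform(points).show()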
+ */ + def predictions: Vector = Vectors.dense(oldModel.predictions) override def copy(extra: ParamMap): IsotonicRegressionModel = { - copyValues(new IsotonicRegressionModel(uid, parentModel), extra) + copyValues(new IsotonicRegressionModel(uid, oldModel), extra).setParent(parent) + } + + override def transform(dataset: DataFrame): DataFrame = { + val predict = dataset.schema($(featuresCol)).dataType match { + case DoubleType => + udf { feature: Double => oldModel.predict(feature) } + case _: VectorUDT => + val idx = $(featureIndex) + udf { features: Vector => oldModel.predict(features(idx)) } + } + dataset.withColumn($(predictionCol), predict(col($(featuresCol)))) + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema, fitting = false) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 3b85ba001b12..884003eb3852 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -45,7 +45,7 @@ import org.apache.spark.util.StatCounter */ private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol - with HasFitIntercept + with HasFitIntercept with HasStandardization /** * :: Experimental :: @@ -84,6 +84,18 @@ class LinearRegression(override val uid: String) def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) setDefault(fitIntercept -> true) + /** + * Whether to standardize the training features before fitting the model. + * The coefficients of models will be always returned on the original scale, + * so it will be transparent for users. Note that with/without standardization, + * the models should be always converged to the same solution when no regularization + * is applied. In R's GLMNET package, the default behavior is true as well. + * Default is true. + * @group setParam + */ + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) + /** * Set the ElasticNet mixing parameter. * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. @@ -165,12 +177,24 @@ class LinearRegression(override val uid: String) val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam val costFun = new LeastSquaresCostFun(instances, yStd, yMean, $(fitIntercept), - featuresStd, featuresMean, effectiveL2RegParam) + $(standardization), featuresStd, featuresMean, effectiveL2RegParam) val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { - new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, effectiveL1RegParam, $(tol)) + def effectiveL1RegFun = (index: Int) => { + if ($(standardization)) { + effectiveL1RegParam + } else { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. 
+ if (featuresStd(index) != 0.0) effectiveL1RegParam / featuresStd(index) else 0.0 + } + } + new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, effectiveL1RegFun, $(tol)) } val initialWeights = Vectors.zeros(numFeatures) @@ -288,7 +312,7 @@ class LinearRegressionModel private[ml] ( override def copy(extra: ParamMap): LinearRegressionModel = { val newModel = copyValues(new LinearRegressionModel(uid, weights, intercept)) if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) - newModel + newModel.setParent(parent) } } @@ -456,6 +480,7 @@ class LinearRegressionSummary private[regression] ( * @param weights The weights/coefficients corresponding to the features. * @param labelStd The standard deviation value of the label. * @param labelMean The mean value of the label. + * @param fitIntercept Whether to fit an intercept term. * @param featuresStd The standard deviation values of the features. * @param featuresMean The mean values of the features. */ @@ -568,6 +593,7 @@ private class LeastSquaresCostFun( labelStd: Double, labelMean: Double, fitIntercept: Boolean, + standardization: Boolean, featuresStd: Array[Double], featuresMean: Array[Double], effectiveL2regParam: Double) extends DiffFunction[BDV[Double]] { @@ -584,14 +610,38 @@ private class LeastSquaresCostFun( case (aggregator1, aggregator2) => aggregator1.merge(aggregator2) }) - // regVal is the sum of weight squares for L2 regularization - val norm = brzNorm(weights, 2.0) - val regVal = 0.5 * effectiveL2regParam * norm * norm + val totalGradientArray = leastSquaresAggregator.gradient.toArray - val loss = leastSquaresAggregator.loss + regVal - val gradient = leastSquaresAggregator.gradient - axpy(effectiveL2regParam, w, gradient) + val regVal = if (effectiveL2regParam == 0.0) { + 0.0 + } else { + var sum = 0.0 + w.foreachActive { (index, value) => + // The following code will compute the loss of the regularization; also + // the gradient of the regularization, and add back to totalGradientArray. + sum += { + if (standardization) { + totalGradientArray(index) += effectiveL2regParam * value + value * value + } else { + if (featuresStd(index) != 0.0) { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. 
+ val temp = value / (featuresStd(index) * featuresStd(index)) + totalGradientArray(index) += effectiveL2regParam * temp + value * temp + } else { + 0.0 + } + } + } + } + 0.5 * effectiveL2regParam * sum + } - (loss, gradient.toBreeze.asInstanceOf[BDV[Double]]) + (leastSquaresAggregator.loss + regVal, new BDV(totalGradientArray)) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 17fb1ad5e15d..2f36da371f57 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -30,7 +30,7 @@ import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestMo import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.DoubleType + /** * :: Experimental :: @@ -87,7 +87,8 @@ final class RandomForestRegressor(override val uid: String) val trees = RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed) .map(_.asInstanceOf[DecisionTreeRegressionModel]) - new RandomForestRegressionModel(trees) + val numFeatures = oldDataset.first().features.size + new RandomForestRegressionModel(trees, numFeatures) } override def copy(extra: ParamMap): RandomForestRegressor = defaultCopy(extra) @@ -108,11 +109,13 @@ object RandomForestRegressor { * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for regression. * It supports both continuous and categorical features. * @param _trees Decision trees in the ensemble. + * @param numFeatures Number of features used by this model */ @Experimental final class RandomForestRegressionModel private[ml] ( override val uid: String, - private val _trees: Array[DecisionTreeRegressionModel]) + private val _trees: Array[DecisionTreeRegressionModel], + val numFeatures: Int) extends PredictionModel[Vector, RandomForestRegressionModel] with TreeEnsembleModel with Serializable { @@ -122,7 +125,8 @@ final class RandomForestRegressionModel private[ml] ( * Construct a random forest regression model, with all trees weighted equally. * @param trees Component trees */ - def this(trees: Array[DecisionTreeRegressionModel]) = this(Identifiable.randomUID("rfr"), trees) + private[ml] def this(trees: Array[DecisionTreeRegressionModel], numFeatures: Int) = + this(Identifiable.randomUID("rfr"), trees, numFeatures) override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] @@ -147,13 +151,30 @@ final class RandomForestRegressionModel private[ml] ( } override def copy(extra: ParamMap): RandomForestRegressionModel = { - copyValues(new RandomForestRegressionModel(uid, _trees), extra) + copyValues(new RandomForestRegressionModel(uid, _trees, numFeatures), extra).setParent(parent) } override def toString: String = { s"RandomForestRegressionModel with $numTrees trees" } + /** + * Estimate of the importance of each feature. + * + * This generalizes the idea of "Gini" importance to other losses, + * following the explanation of Gini importance from "Random Forests" documentation + * by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. 
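// Illustrative sketch, not part of the patch: the reverse-standardization rule used above when
// standardization is false. The data is still standardized internally for convergence, so each
// component's penalty is divided by its feature's standard deviation (squared for L2) to recover
// the objective on the original scale. The lambda, weight, and std values are hypothetical.
def effectiveL1PerFeature(lambda: Double, featuresStd: Array[Double]): Array[Double] =
  featuresStd.map(std => if (std != 0.0) lambda / std else 0.0)
def l2GradientTerm(lambda: Double, weight: Double, std: Double): Double =
  if (std != 0.0) lambda * weight / (std * std) else 0.0
println(effectiveL1PerFeature(0.5, Array(2.0, 0.5, 0.0)).mkString(", "))   // 0.25, 1.0, 0.0
println(l2GradientTerm(0.5, 1.5, 2.0))                                     // 0.1875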
+ * + * This feature importance is calculated as follows: + * - Average over trees: + * - importance(feature j) = sum (over nodes which split on feature j) of the gain, + * where gain is scaled by the number of instances passing through node + * - Normalize importances for tree based on total number of training instances used + * to build tree. + * - Normalize feature importance vector to sum to 1. + */ + lazy val featureImportances: Vector = RandomForest.featureImportances(trees, numFeatures) + /** (private[ml]) Convert to a model in the old API */ private[ml] def toOld: OldRandomForestModel = { new OldRandomForestModel(OldAlgo.Regression, _trees.map(_.toOld)) @@ -173,6 +194,6 @@ private[ml] object RandomForestRegressionModel { // parent for each tree is null since there is no good way to set this. DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } - new RandomForestRegressionModel(parent.uid, newTrees) + new RandomForestRegressionModel(parent.uid, newTrees, -1) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala index 8879352a600a..cd2493129390 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala @@ -44,7 +44,7 @@ sealed abstract class Node extends Serializable { * and probabilities. * For classification, the array of class counts must be normalized to a probability distribution. */ - private[tree] def impurityStats: ImpurityCalculator + private[ml] def impurityStats: ImpurityCalculator /** Recursive prediction helper method */ private[ml] def predictImpl(features: Vector): LeafNode @@ -72,6 +72,12 @@ sealed abstract class Node extends Serializable { * @param id Node ID using old format IDs */ private[ml] def toOld(id: Int): OldNode + + /** + * Trace down the tree, and return the largest feature index used in any split. + * @return Max feature index used in a split, or -1 if there are no splits (single leaf node). 
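   *         For instance, a depth-1 tree that splits on feature 3 and has two leaf
   *         children returns 3, since each leaf contributes -1.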
+ */ + private[ml] def maxSplitFeatureIndex(): Int } private[ml] object Node { @@ -109,7 +115,7 @@ private[ml] object Node { final class LeafNode private[ml] ( override val prediction: Double, override val impurity: Double, - override val impurityStats: ImpurityCalculator) extends Node { + override private[ml] val impurityStats: ImpurityCalculator) extends Node { override def toString: String = s"LeafNode(prediction = $prediction, impurity = $impurity)" @@ -129,6 +135,8 @@ final class LeafNode private[ml] ( new OldNode(id, new OldPredict(prediction, prob = impurityStats.prob(prediction)), impurity, isLeaf = true, None, None, None, None) } + + override private[ml] def maxSplitFeatureIndex(): Int = -1 } /** @@ -150,7 +158,7 @@ final class InternalNode private[ml] ( val leftChild: Node, val rightChild: Node, val split: Split, - override val impurityStats: ImpurityCalculator) extends Node { + override private[ml] val impurityStats: ImpurityCalculator) extends Node { override def toString: String = { s"InternalNode(prediction = $prediction, impurity = $impurity, split = $split)" @@ -190,6 +198,11 @@ final class InternalNode private[ml] ( new OldPredict(leftChild.prediction, prob = 0.0), new OldPredict(rightChild.prediction, prob = 0.0)))) } + + override private[ml] def maxSplitFeatureIndex(): Int = { + math.max(split.featureIndex, + math.max(leftChild.maxSplitFeatureIndex(), rightChild.maxSplitFeatureIndex())) + } } private object InternalNode { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index a8b90d9d266a..4ac51a475474 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -26,6 +26,7 @@ import org.apache.spark.Logging import org.apache.spark.ml.classification.DecisionTreeClassificationModel import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree._ +import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy} import org.apache.spark.mllib.tree.impl.{BaggedPoint, DTStatsAggregator, DecisionTreeMetadata, @@ -34,6 +35,7 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator import org.apache.spark.mllib.tree.model.ImpurityStats import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.collection.OpenHashMap import org.apache.spark.util.random.{SamplingUtils, XORShiftRandom} @@ -1113,4 +1115,94 @@ private[ml] object RandomForest extends Logging { } } + /** + * Given a Random Forest model, compute the importance of each feature. + * This generalizes the idea of "Gini" importance to other losses, + * following the explanation of Gini importance from "Random Forests" documentation + * by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + * + * This feature importance is calculated as follows: + * - Average over trees: + * - importance(feature j) = sum (over nodes which split on feature j) of the gain, + * where gain is scaled by the number of instances passing through node + * - Normalize importances for tree based on total number of training instances used + * to build tree. + * - Normalize feature importance vector to sum to 1. + * + * Note: This should not be used with Gradient-Boosted Trees. 
It only makes sense for + * independently trained trees. + * @param trees Unweighted forest of trees + * @param numFeatures Number of features in model (even if not all are explicitly used by + * the model). + * If -1, then numFeatures is set based on the max feature index in all trees. + * @return Feature importance values, of length numFeatures. + */ + private[ml] def featureImportances(trees: Array[DecisionTreeModel], numFeatures: Int): Vector = { + val totalImportances = new OpenHashMap[Int, Double]() + trees.foreach { tree => + // Aggregate feature importance vector for this tree + val importances = new OpenHashMap[Int, Double]() + computeFeatureImportance(tree.rootNode, importances) + // Normalize importance vector for this tree, and add it to total. + // TODO: In the future, also support normalizing by tree.rootNode.impurityStats.count? + val treeNorm = importances.map(_._2).sum + if (treeNorm != 0) { + importances.foreach { case (idx, impt) => + val normImpt = impt / treeNorm + totalImportances.changeValue(idx, normImpt, _ + normImpt) + } + } + } + // Normalize importances + normalizeMapValues(totalImportances) + // Construct vector + val d = if (numFeatures != -1) { + numFeatures + } else { + // Find max feature index used in trees + val maxFeatureIndex = trees.map(_.maxSplitFeatureIndex()).max + maxFeatureIndex + 1 + } + if (d == 0) { + assert(totalImportances.size == 0, s"Unknown error in computing RandomForest feature" + + s" importance: No splits in forest, but some non-zero importances.") + } + val (indices, values) = totalImportances.iterator.toSeq.sortBy(_._1).unzip + Vectors.sparse(d, indices.toArray, values.toArray) + } + + /** + * Recursive method for computing feature importances for one tree. + * This walks down the tree, adding to the importance of 1 feature at each node. + * @param node Current node in recursion + * @param importances Aggregate feature importances, modified by this method + */ + private[impl] def computeFeatureImportance( + node: Node, + importances: OpenHashMap[Int, Double]): Unit = { + node match { + case n: InternalNode => + val feature = n.split.featureIndex + val scaledGain = n.gain * n.impurityStats.count + importances.changeValue(feature, scaledGain, _ + scaledGain) + computeFeatureImportance(n.leftChild, importances) + computeFeatureImportance(n.rightChild, importances) + case n: LeafNode => + // do nothing + } + } + + /** + * Normalize the values of this map to sum to 1, in place. + * If all values are 0, this method does nothing. + * @param map Map with non-negative values. + */ + private[impl] def normalizeMapValues(map: OpenHashMap[Int, Double]): Unit = { + val total = map.map(_._2).sum + if (total != 0) { + val keys = map.iterator.map(_._1).toArray + keys.foreach { key => map.changeValue(key, 0.0, _ / total) } + } + } + } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 22873909c33f..b77191156f68 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -53,6 +53,12 @@ private[ml] trait DecisionTreeModel { val header = toString + "\n" header + rootNode.subtreeToString(2) } + + /** + * Trace down the tree, and return the largest feature index used in any split. + * @return Max feature index used in a split, or -1 if there are no splits (single leaf node). 
+ */ + private[ml] def maxSplitFeatureIndex(): Int = rootNode.maxSplitFeatureIndex() } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index a0c5238d966b..dbd8d31571d2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -17,9 +17,10 @@ package org.apache.spark.ml.tree +import org.apache.spark.ml.classification.ClassifierParams import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed} +import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed, HasThresholds} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, BoostingStrategy => OldBoostingStrategy, Strategy => OldStrategy} import org.apache.spark.mllib.tree.impurity.{Entropy => OldEntropy, Gini => OldGini, Impurity => OldImpurity, Variance => OldVariance} import org.apache.spark.mllib.tree.loss.{Loss => OldLoss} @@ -162,7 +163,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { oldAlgo: OldAlgo.Algo, oldImpurity: OldImpurity, subsamplingRate: Double): OldStrategy = { - val strategy = OldStrategy.defaultStategy(oldAlgo) + val strategy = OldStrategy.defaultStrategy(oldAlgo) strategy.impurity = oldImpurity strategy.checkpointInterval = getCheckpointInterval strategy.maxBins = getMaxBins diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index f979319cc4b5..0679bfd0f3ff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -100,7 +100,9 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM } f2jBLAS.dscal(numModels, 1.0 / $(numFolds), metrics, 1) logInfo(s"Average cross-validation metrics: ${metrics.toSeq}") - val (bestMetric, bestIndex) = metrics.zipWithIndex.maxBy(_._1) + val (bestMetric, bestIndex) = + if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) + else metrics.zipWithIndex.minBy(_._1) logInfo(s"Best set of parameters:\n${epm(bestIndex)}") logInfo(s"Best cross-validation metric: $bestMetric.") val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]] @@ -160,6 +162,6 @@ class CrossValidatorModel private[ml] ( uid, bestModel.copy(extra).asInstanceOf[Model[_]], avgMetrics.clone()) - copyValues(copied, extra) + copyValues(copied, extra).setParent(parent) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index c0edc730b6fd..73a14b831015 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -99,7 +99,9 @@ class TrainValidationSplit(override val uid: String) extends Estimator[TrainVali validationDataset.unpersist() logInfo(s"Train validation split metrics: ${metrics.toSeq}") - val (bestMetric, bestIndex) = metrics.zipWithIndex.maxBy(_._1) + val (bestMetric, bestIndex) = + if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) + else metrics.zipWithIndex.minBy(_._1) logInfo(s"Best set of parameters:\n${epm(bestIndex)}") logInfo(s"Best train validation split metric: $bestMetric.") val bestModel = est.fit(dataset, 
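      // Illustrative note: a RegressionEvaluator configured for RMSE reports
      // isLargerBetter == false, so the minBy branch above selects the best parameter
      // map, while metrics such as areaUnderROC take the maxBy branch.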
epm(bestIndex)).asInstanceOf[Model[_]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala index ddd34a54503a..bd213e7362e9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala @@ -19,11 +19,19 @@ package org.apache.spark.ml.util import java.util.UUID +import org.apache.spark.annotation.DeveloperApi + /** + * :: DeveloperApi :: + * * Trait for an object with an immutable unique ID that identifies itself and its derivatives. + * + * WARNING: There have not yet been final discussions on this API, so it may be broken in future + * releases. */ -private[spark] trait Identifiable { +@DeveloperApi +trait Identifiable { /** * An immutable unique ID for the object and its derivatives. @@ -33,7 +41,11 @@ private[spark] trait Identifiable { override def toString: String = uid } -private[spark] object Identifiable { +/** + * :: DeveloperApi :: + */ +@DeveloperApi +object Identifiable { /** * Returns a random UID that concatenates the given prefix, "_", and 12 random hex chars. diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala index 2a1db90f2ca2..fcb517b5f735 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.util import scala.collection.immutable.HashMap import org.apache.spark.ml.attribute._ +import org.apache.spark.mllib.linalg.VectorUDT import org.apache.spark.sql.types.StructField @@ -74,4 +75,20 @@ private[spark] object MetadataUtils { } } + /** + * Takes a Vector column and a list of feature names, and returns the corresponding list of + * feature indices in the column, in order. 
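   * For illustration, assuming the vector column's attributes carry the names
   * "f0", "f1", "f2" at positions 0, 1, 2 (the `df` DataFrame and column name
   * are hypothetical):
   * {{{
   *   getFeatureIndicesFromNames(df.schema("features"), Array("f2", "f0"))  // Array(2, 0)
   * }}}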
+ * @param col Vector column which must have feature names specified via attributes + * @param names List of feature names + */ + def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = { + require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col" + + s" to be Vector type, but it was type ${col.dataType} instead.") + val inputAttr = AttributeGroup.fromStructField(col) + names.map { name => + require(inputAttr.hasAttr(name), + s"getFeatureIndicesFromNames found no feature with name $name in column $col.") + inputAttr.getAttr(name).index.get + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 6f080d32bbf4..f585aacd452e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -37,6 +37,7 @@ import org.apache.spark.mllib.evaluation.RankingMetrics import org.apache.spark.mllib.feature._ import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel} import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.linalg.distributed._ import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.random.{RandomRDDs => RG} import org.apache.spark.mllib.recommendation._ @@ -54,7 +55,7 @@ import org.apache.spark.mllib.tree.{DecisionTree, GradientBoostedTrees, RandomFo import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils @@ -1096,6 +1097,81 @@ private[python] class PythonMLLibAPI extends Serializable { Statistics.kolmogorovSmirnovTest(data, distName, paramsSeq: _*) } + /** + * Wrapper around RowMatrix constructor. + */ + def createRowMatrix(rows: JavaRDD[Vector], numRows: Long, numCols: Int): RowMatrix = { + new RowMatrix(rows.rdd, numRows, numCols) + } + + /** + * Wrapper around IndexedRowMatrix constructor. + */ + def createIndexedRowMatrix(rows: DataFrame, numRows: Long, numCols: Int): IndexedRowMatrix = { + // We use DataFrames for serialization of IndexedRows from Python, + // so map each Row in the DataFrame back to an IndexedRow. + val indexedRows = rows.map { + case Row(index: Long, vector: Vector) => IndexedRow(index, vector) + } + new IndexedRowMatrix(indexedRows, numRows, numCols) + } + + /** + * Wrapper around CoordinateMatrix constructor. + */ + def createCoordinateMatrix(rows: DataFrame, numRows: Long, numCols: Long): CoordinateMatrix = { + // We use DataFrames for serialization of MatrixEntry entries from + // Python, so map each Row in the DataFrame back to a MatrixEntry. + val entries = rows.map { + case Row(i: Long, j: Long, value: Double) => MatrixEntry(i, j, value) + } + new CoordinateMatrix(entries, numRows, numCols) + } + + /** + * Wrapper around BlockMatrix constructor. + */ + def createBlockMatrix(blocks: DataFrame, rowsPerBlock: Int, colsPerBlock: Int, + numRows: Long, numCols: Long): BlockMatrix = { + // We use DataFrames for serialization of sub-matrix blocks from + // Python, so map each Row in the DataFrame back to a + // ((blockRowIndex, blockColIndex), sub-matrix) tuple. 
+ val blockTuples = blocks.map { + case Row(Row(blockRowIndex: Long, blockColIndex: Long), subMatrix: Matrix) => + ((blockRowIndex.toInt, blockColIndex.toInt), subMatrix) + } + new BlockMatrix(blockTuples, rowsPerBlock, colsPerBlock, numRows, numCols) + } + + /** + * Return the rows of an IndexedRowMatrix. + */ + def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = { + // We use DataFrames for serialization of IndexedRows to Python, + // so return a DataFrame. + val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext) + sqlContext.createDataFrame(indexedRowMatrix.rows) + } + + /** + * Return the entries of a CoordinateMatrix. + */ + def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = { + // We use DataFrames for serialization of MatrixEntry entries to + // Python, so return a DataFrame. + val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext) + sqlContext.createDataFrame(coordinateMatrix.entries) + } + + /** + * Return the sub-matrix blocks of a BlockMatrix. + */ + def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = { + // We use DataFrames for serialization of sub-matrix blocks to + // Python, so return a DataFrame. + val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext) + sqlContext.createDataFrame(blockMatrix.blocks) + } } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index ba73024e3c04..85a413243b04 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.classification import org.json4s.{DefaultFormats, JValue} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -30,14 +30,15 @@ import org.apache.spark.rdd.RDD * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc. */ @Experimental +@Since("0.8.0") trait ClassificationModel extends Serializable { /** * Predict values for the given data set using the model trained. * * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -45,16 +46,16 @@ trait ClassificationModel extends Serializable { * * @param testData array representing a single data point * @return predicted category from the trained model - * @since 0.8.0 */ + @Since("1.0.0") def predict(testData: Vector): Double /** * Predict values for examples stored in a JavaRDD. 
* @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 268642ac6a2f..5ceff5b2259e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.linalg.{DenseVector, Vector} @@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD * Multinomial Logistic Regression. By default, it is binary logistic regression * so numClasses will be set to 2. */ -class LogisticRegressionModel ( - override val weights: Vector, - override val intercept: Double, - val numFeatures: Int, - val numClasses: Int) +@Since("0.8.0") +class LogisticRegressionModel @Since("1.3.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("1.0.0") override val intercept: Double, + @Since("1.3.0") val numFeatures: Int, + @Since("1.3.0") val numClasses: Int) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -75,6 +76,7 @@ class LogisticRegressionModel ( /** * Constructs a [[LogisticRegressionModel]] with weights and intercept for binary classification. */ + @Since("1.0.0") def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2) private var threshold: Option[Double] = Some(0.5) @@ -85,8 +87,8 @@ class LogisticRegressionModel ( * in Binary Logistic Regression. An example with prediction score greater than or equal to * this threshold is identified as an positive, and negative otherwise. The default value is 0.5. * It is only used for binary classification. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -97,8 +99,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. * It is only used for binary classification. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold @@ -106,8 +108,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. * It is only used for binary classification. 
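   * (With the threshold cleared, the binary model returns the class-1 probability
   * 1 / (1 + exp(-margin)) instead of a 0/1 label.)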
- * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -158,9 +160,7 @@ class LogisticRegressionModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures, numClasses, weights, intercept, threshold) @@ -168,19 +168,15 @@ class LogisticRegressionModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ override def toString: String = { s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}" } } +@Since("1.3.0") object LogisticRegressionModel extends Loader[LogisticRegressionModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): LogisticRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -213,6 +209,7 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] { * for k classes multi-label classification problem. * Using [[LogisticRegressionWithLBFGS]] is recommended over this. */ +@Since("0.8.0") class LogisticRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -222,6 +219,7 @@ class LogisticRegressionWithSGD private[mllib] ( private val gradient = new LogisticGradient() private val updater = new SquaredL2Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -233,6 +231,7 @@ class LogisticRegressionWithSGD private[mllib] ( * Construct a LogisticRegression object with default parameters: {stepSize: 1.0, * numIterations: 100, regParm: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected[mllib] def createModel(weights: Vector, intercept: Double) = { @@ -244,6 +243,7 @@ class LogisticRegressionWithSGD private[mllib] ( * Top-level methods for calling Logistic Regression using Stochastic Gradient Descent. * NOTE: Labels used in Logistic Regression should be {0, 1} */ +@Since("0.8.0") object LogisticRegressionWithSGD { // NOTE(shivaram): We use multiple train methods instead of default arguments to support // Java programs. @@ -261,8 +261,8 @@ object LogisticRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -284,8 +284,8 @@ object LogisticRegressionWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -306,8 +306,8 @@ object LogisticRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -324,8 +324,8 @@ object LogisticRegressionWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. 
* @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int): LogisticRegressionModel = { @@ -339,11 +339,13 @@ object LogisticRegressionWithSGD { * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1} * for k classes multi-label classification problem. */ +@Since("1.1.0") class LogisticRegressionWithLBFGS extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable { this.setFeatureScaling(true) + @Since("1.1.0") override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater) override protected val validators = List(multiLabelValidator) @@ -361,8 +363,8 @@ class LogisticRegressionWithLBFGS * Set the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * By default, it is binary logistic regression so k will be set to 2. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def setNumClasses(numClasses: Int): this.type = { require(numClasses > 1) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 2df91c09421e..a956084ae06e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -25,6 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext, SparkException} +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{Loader, Saveable} @@ -40,11 +41,12 @@ import org.apache.spark.sql.{DataFrame, SQLContext} * where D is number of features * @param modelType The type of NB model to fit can be "multinomial" or "bernoulli" */ +@Since("0.9.0") class NaiveBayesModel private[spark] ( - val labels: Array[Double], - val pi: Array[Double], - val theta: Array[Array[Double]], - val modelType: String) + @Since("1.0.0") val labels: Array[Double], + @Since("0.9.0") val pi: Array[Double], + @Since("0.9.0") val theta: Array[Array[Double]], + @Since("1.4.0") val modelType: String) extends ClassificationModel with Serializable with Saveable { import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes} @@ -82,6 +84,7 @@ class NaiveBayesModel private[spark] ( throw new UnknownError(s"Invalid modelType: $modelType.") } + @Since("1.0.0") override def predict(testData: RDD[Vector]): RDD[Double] = { val bcModel = testData.context.broadcast(this) testData.mapPartitions { iter => @@ -90,6 +93,7 @@ class NaiveBayesModel private[spark] ( } } + @Since("1.0.0") override def predict(testData: Vector): Double = { modelType match { case Multinomial => @@ -106,6 +110,7 @@ class NaiveBayesModel private[spark] ( * @return an RDD[Vector] where each entry contains the predicted posterior class probabilities, * in the same order as class labels */ + @Since("1.5.0") def predictProbabilities(testData: RDD[Vector]): RDD[Vector] = { val bcModel = testData.context.broadcast(this) testData.mapPartitions { iter => @@ -121,6 +126,7 @@ class NaiveBayesModel private[spark] ( * @return predicted posterior class probabilities from the trained model, * in the same order as class labels */ + @Since("1.5.0") def predictProbabilities(testData: Vector): Vector = { 
modelType match { case Multinomial => @@ -157,6 +163,7 @@ class NaiveBayesModel private[spark] ( new DenseVector(scaledProbs.map(_ / probSum)) } + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType) NaiveBayesModel.SaveLoadV2_0.save(sc, path, data) @@ -165,6 +172,7 @@ class NaiveBayesModel private[spark] ( override protected def formatVersion: String = "2.0" } +@Since("1.3.0") object NaiveBayesModel extends Loader[NaiveBayesModel] { import org.apache.spark.mllib.util.Loader._ @@ -198,6 +206,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { dataRDD.write.parquet(dataPath(path)) } + @Since("1.3.0") def load(sc: SparkContext, path: String): NaiveBayesModel = { val sqlContext = new SQLContext(sc) // Load Parquet data. @@ -300,30 +309,35 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { * document classification. By making every vector a 0-1 vector, it can also be used as * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. */ - +@Since("0.9.0") class NaiveBayes private ( private var lambda: Double, private var modelType: String) extends Serializable with Logging { import NaiveBayes.{Bernoulli, Multinomial} + @Since("1.4.0") def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) + @Since("0.9.0") def this() = this(1.0, NaiveBayes.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ + @Since("0.9.0") def setLambda(lambda: Double): NaiveBayes = { this.lambda = lambda this } /** Get the smoothing parameter. */ + @Since("1.4.0") def getLambda: Double = lambda /** * Set the model type using a string (case-sensitive). * Supported options: "multinomial" (default) and "bernoulli". */ + @Since("1.4.0") def setModelType(modelType: String): NaiveBayes = { require(NaiveBayes.supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") @@ -332,6 +346,7 @@ class NaiveBayes private ( } /** Get the model type. */ + @Since("1.4.0") def getModelType: String = this.modelType /** @@ -339,6 +354,7 @@ class NaiveBayes private ( * * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. */ + @Since("0.9.0") def run(data: RDD[LabeledPoint]): NaiveBayesModel = { val requireNonnegativeValues: Vector => Unit = (v: Vector) => { val values = v match { @@ -422,6 +438,7 @@ class NaiveBayes private ( /** * Top-level methods for calling naive Bayes. */ +@Since("0.9.0") object NaiveBayes { /** String name for multinomial model type. */ @@ -444,8 +461,8 @@ object NaiveBayes { * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint]): NaiveBayesModel = { new NaiveBayes().run(input) } @@ -460,8 +477,8 @@ object NaiveBayes { * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. 
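   *              A minimal sketch, with a hypothetical `training` RDD:
   *              {{{ val model = NaiveBayes.train(training, lambda = 1.0) }}}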
* @param lambda The smoothing parameter - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { new NaiveBayes(lambda, Multinomial).run(input) } @@ -483,8 +500,8 @@ object NaiveBayes { * * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be * multinomial or bernoulli - * @since 0.9.0 */ + @Since("1.4.0") def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { require(supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 5b54feeb1046..896565cd90e8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ @@ -33,9 +33,10 @@ import org.apache.spark.rdd.RDD * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. */ -class SVMModel ( - override val weights: Vector, - override val intercept: Double) +@Since("0.8.0") +class SVMModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -46,8 +47,8 @@ class SVMModel ( * Sets the threshold that separates positive predictions from negative predictions. An example * with prediction score greater than or equal to this threshold is identified as an positive, * and negative otherwise. The default value is 0.0. - * @since 1.3.0 */ + @Since("1.0.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -57,16 +58,16 @@ class SVMModel ( /** * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold /** * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. 
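   * (With the threshold cleared, the raw score is the margin w·x + intercept
   * rather than a 0/1 label.)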
- * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -84,9 +85,7 @@ class SVMModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures = weights.size, numClasses = 2, weights, intercept, threshold) @@ -94,19 +93,15 @@ class SVMModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ override def toString: String = { s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}" } } +@Since("1.3.0") object SVMModel extends Loader[SVMModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): SVMModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -138,6 +133,7 @@ object SVMModel extends Loader[SVMModel] { * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. * NOTE: Labels used in SVM should be {0, 1}. */ +@Since("0.8.0") class SVMWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -147,6 +143,7 @@ class SVMWithSGD private ( private val gradient = new HingeGradient() private val updater = new SquaredL2Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -158,6 +155,7 @@ class SVMWithSGD private ( * Construct a SVM object with default parameters: {stepSize: 1.0, numIterations: 100, * regParm: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { @@ -168,6 +166,7 @@ class SVMWithSGD private ( /** * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. */ +@Since("0.8.0") object SVMWithSGD { /** @@ -185,8 +184,8 @@ object SVMWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -209,8 +208,8 @@ object SVMWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -231,8 +230,8 @@ object SVMWithSGD { * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -250,8 +249,8 @@ object SVMWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. 
- * @since 0.8.0 */ + @Since("0.8.0") def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = { train(input, numIterations, 1.0, 0.01, 1.0) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala index 7d33df3221fb..75630054d136 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.classification -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.StreamingLinearAlgorithm @@ -44,6 +44,7 @@ import org.apache.spark.mllib.regression.StreamingLinearAlgorithm * }}} */ @Experimental +@Since("1.3.0") class StreamingLogisticRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -58,6 +59,7 @@ class StreamingLogisticRegressionWithSGD private[mllib] ( * Initial weights must be set before using trainOn or predictOn * (see `StreamingLinearAlgorithm`) */ + @Since("1.3.0") def this() = this(0.1, 50, 1.0, 0.0) protected val algorithm = new LogisticRegressionWithSGD( @@ -66,30 +68,35 @@ class StreamingLogisticRegressionWithSGD private[mllib] ( protected var model: Option[LogisticRegressionModel] = None /** Set the step size for gradient descent. Default: 0.1. */ + @Since("1.3.0") def setStepSize(stepSize: Double): this.type = { this.algorithm.optimizer.setStepSize(stepSize) this } /** Set the number of iterations of gradient descent to run per update. Default: 50. */ + @Since("1.3.0") def setNumIterations(numIterations: Int): this.type = { this.algorithm.optimizer.setNumIterations(numIterations) this } /** Set the fraction of each batch to use for updates. Default: 1.0. */ + @Since("1.3.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { this.algorithm.optimizer.setMiniBatchFraction(miniBatchFraction) this } /** Set the regularization parameter. Default: 0.0. */ + @Since("1.3.0") def setRegParam(regParam: Double): this.type = { this.algorithm.optimizer.setRegParam(regParam) this } /** Set the initial weights. Default: [0.0, 0.0]. 
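   * For example: {{{ model.setInitialWeights(Vectors.zeros(numFeatures)) }}}
   * where `numFeatures` is hypothetical and must match the dimension of the
   * streaming feature vectors.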
*/ + @Since("1.3.0") def setInitialWeights(initialWeights: Vector): this.type = { this.model = Some(algorithm.createModel(initialWeights, 0.0)) this diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index e459367333d2..f82bd82c2037 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.IndexedSeq import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -53,6 +53,7 @@ import org.apache.spark.util.Utils * @param maxIterations The maximum number of iterations to perform */ @Experimental +@Since("1.3.0") class GaussianMixture private ( private var k: Int, private var convergenceTol: Double, @@ -63,6 +64,7 @@ class GaussianMixture private ( * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01, * maxIterations: 100, seed: random}. */ + @Since("1.3.0") def this() = this(2, 0.01, 100, Utils.random.nextLong()) // number of samples per cluster to use when initializing Gaussians @@ -72,10 +74,12 @@ class GaussianMixture private ( // default random starting point private var initialModel: Option[GaussianMixtureModel] = None - /** Set the initial GMM starting point, bypassing the random initialization. - * You must call setK() prior to calling this method, and the condition - * (model.k == this.k) must be met; failure will result in an IllegalArgumentException + /** + * Set the initial GMM starting point, bypassing the random initialization. + * You must call setK() prior to calling this method, and the condition + * (model.k == this.k) must be met; failure will result in an IllegalArgumentException */ + @Since("1.3.0") def setInitialModel(model: GaussianMixtureModel): this.type = { if (model.k == k) { initialModel = Some(model) @@ -85,31 +89,47 @@ class GaussianMixture private ( this } - /** Return the user supplied initial GMM, if supplied */ + /** + * Return the user supplied initial GMM, if supplied + */ + @Since("1.3.0") def getInitialModel: Option[GaussianMixtureModel] = initialModel - /** Set the number of Gaussians in the mixture model. Default: 2 */ + /** + * Set the number of Gaussians in the mixture model. Default: 2 + */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this } - /** Return the number of Gaussians in the mixture model */ + /** + * Return the number of Gaussians in the mixture model + */ + @Since("1.3.0") def getK: Int = k - /** Set the maximum number of iterations to run. Default: 100 */ + /** + * Set the maximum number of iterations to run. Default: 100 + */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } - /** Return the maximum number of iterations to run */ + /** + * Return the maximum number of iterations to run + */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Set the largest change in log-likelihood at which convergence is * considered to have occurred. 
*/ + @Since("1.3.0") def setConvergenceTol(convergenceTol: Double): this.type = { this.convergenceTol = convergenceTol this @@ -119,18 +139,28 @@ class GaussianMixture private ( * Return the largest change in log-likelihood at which convergence is * considered to have occurred. */ + @Since("1.3.0") def getConvergenceTol: Double = convergenceTol - /** Set the random seed */ + /** + * Set the random seed + */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this } - /** Return the random seed */ + /** + * Return the random seed + */ + @Since("1.3.0") def getSeed: Long = seed - /** Perform expectation maximization */ + /** + * Perform expectation maximization + */ + @Since("1.3.0") def run(data: RDD[Vector]): GaussianMixtureModel = { val sc = data.sparkContext @@ -140,9 +170,7 @@ class GaussianMixture private ( // Get length of the input vectors val d = breezeData.first().length - // Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when - // d > 25 except for when k is very small - val distributeGaussians = ((k - 1.0) / k) * d > 25 + val shouldDistributeGaussians = GaussianMixture.shouldDistributeGaussians(k, d) // Determine initial weights and corresponding Gaussians. // If the user supplied an initial GMM, we use those values, otherwise @@ -176,15 +204,15 @@ class GaussianMixture private ( // (often referred to as the "M" step in literature) val sumWeights = sums.weights.sum - if (distributeGaussians) { + if (shouldDistributeGaussians) { val numPartitions = math.min(k, 1024) val tuples = Seq.tabulate(k)(i => (sums.means(i), sums.sigmas(i), sums.weights(i))) val (ws, gs) = sc.parallelize(tuples, numPartitions).map { case (mean, sigma, weight) => updateWeightsAndGaussians(mean, sigma, weight, sumWeights) - }.collect.unzip - Array.copy(ws, 0, weights, 0, ws.length) - Array.copy(gs, 0, gaussians, 0, gs.length) + }.collect().unzip + Array.copy(ws.toArray, 0, weights, 0, ws.length) + Array.copy(gs.toArray, 0, gaussians, 0, gs.length) } else { var i = 0 while (i < k) { @@ -204,7 +232,10 @@ class GaussianMixture private ( new GaussianMixtureModel(weights, gaussians) } - /** Java-friendly version of [[run()]] */ + /** + * Java-friendly version of [[run()]] + */ + @Since("1.3.0") def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd) private def updateWeightsAndGaussians( @@ -239,6 +270,16 @@ class GaussianMixture private ( } } +private[clustering] object GaussianMixture { + /** + * Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when + * d > 25 except for when k is very small. 
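   *
   * For example, k = 5 and d = 40 gives (4.0 / 5) * 40 = 32 > 25, so the Gaussian
   * updates are distributed, while k = 2 and d = 30 gives 15, so they are computed
   * locally on the driver.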
+ * @param k Number of topics + * @param d Number of features + */ + def shouldDistributeGaussians(k: Int, d: Int): Boolean = ((k - 1.0) / k) * d > 25 +} + // companion class to provide zero constructor for ExpectationSum private object ExpectationSum { def zero(k: Int, d: Int): ExpectationSum = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index cb807c803810..7f6163e04bf1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Matrices, Matrix} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -44,29 +44,49 @@ import org.apache.spark.sql.{SQLContext, Row} * @param gaussians Array of MultivariateGaussian where gaussians(i) represents * the Multivariate Gaussian (Normal) Distribution for Gaussian i */ +@Since("1.3.0") @Experimental -class GaussianMixtureModel( - val weights: Array[Double], - val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { +class GaussianMixtureModel @Since("1.3.0") ( + @Since("1.3.0") val weights: Array[Double], + @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match") override protected def formatVersion = "1.0" + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { GaussianMixtureModel.SaveLoadV1_0.save(sc, path, weights, gaussians) } - /** Number of gaussians in mixture */ + /** + * Number of gaussians in mixture + */ + @Since("1.3.0") def k: Int = weights.length - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + */ + @Since("1.3.0") def predict(points: RDD[Vector]): RDD[Int] = { val responsibilityMatrix = predictSoft(points) responsibilityMatrix.map(r => r.indexOf(r.max)) } - /** Java-friendly version of [[predict()]] */ + /** + * Maps given point to its cluster index. + */ + @Since("1.5.0") + def predict(point: Vector): Int = { + val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) + r.indexOf(r.max) + } + + /** + * Java-friendly version of [[predict()]] + */ + @Since("1.4.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] @@ -74,6 +94,7 @@ class GaussianMixtureModel( * Given the input vectors, return the membership value of each vector * to all mixture components. */ + @Since("1.3.0") def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = { val sc = points.sparkContext val bcDists = sc.broadcast(gaussians) @@ -83,6 +104,14 @@ class GaussianMixtureModel( } } + /** + * Given the input vector, return the membership values to all mixture components. 
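   * The returned array has length k and sums to 1; `predict(point)` above is simply
   * the index of the largest entry of `predictSoft(point)`.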
+ */ + @Since("1.4.0") + def predictSoft(point: Vector): Array[Double] = { + computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) + } + /** * Compute the partial assignments for each vector */ @@ -102,6 +131,7 @@ class GaussianMixtureModel( } } +@Since("1.4.0") @Experimental object GaussianMixtureModel extends Loader[GaussianMixtureModel] { @@ -148,10 +178,11 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { (weight, new MultivariateGaussian(mu, sigma)) }.unzip - return new GaussianMixtureModel(weights.toArray, gaussians.toArray) + new GaussianMixtureModel(weights.toArray, gaussians.toArray) } } + @Since("1.4.0") override def load(sc: SparkContext, path: String) : GaussianMixtureModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 0a65403f4ec9..7168aac32c99 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} import org.apache.spark.mllib.util.MLUtils @@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given * to it should be cached by the user. */ +@Since("0.8.0") class KMeans private ( private var k: Int, private var maxIterations: Int, @@ -50,14 +51,19 @@ class KMeans private ( * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1, * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}. */ + @Since("0.8.0") def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) /** * Number of clusters to create (k). */ + @Since("1.4.0") def getK: Int = k - /** Set the number of clusters to create (k). Default: 2. */ + /** + * Set the number of clusters to create (k). Default: 2. + */ + @Since("0.8.0") def setK(k: Int): this.type = { this.k = k this @@ -66,9 +72,13 @@ class KMeans private ( /** * Maximum number of iterations to run. */ + @Since("1.4.0") def getMaxIterations: Int = maxIterations - /** Set maximum number of iterations to run. Default: 20. */ + /** + * Set maximum number of iterations to run. Default: 20. + */ + @Since("0.8.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -77,6 +87,7 @@ class KMeans private ( /** * The initialization algorithm. This can be either "random" or "k-means||". */ + @Since("1.4.0") def getInitializationMode: String = initializationMode /** @@ -84,6 +95,7 @@ class KMeans private ( * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. 
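   * For example: {{{ new KMeans().setK(3).setInitializationMode(KMeans.RANDOM) }}}
   * The accepted values are the constants `KMeans.RANDOM` and `KMeans.K_MEANS_PARALLEL`.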
*/ + @Since("0.8.0") def setInitializationMode(initializationMode: String): this.type = { KMeans.validateInitMode(initializationMode) this.initializationMode = initializationMode @@ -94,6 +106,7 @@ class KMeans private ( * :: Experimental :: * Number of runs of the algorithm to execute in parallel. */ + @Since("1.4.0") @Experimental def getRuns: Int = runs @@ -103,6 +116,7 @@ class KMeans private ( * this many times with random starting conditions (configured by the initialization mode), then * return the best clustering found over any run. Default: 1. */ + @Since("0.8.0") @Experimental def setRuns(runs: Int): this.type = { if (runs <= 0) { @@ -115,12 +129,14 @@ class KMeans private ( /** * Number of steps for the k-means|| initialization mode */ + @Since("1.4.0") def getInitializationSteps: Int = initializationSteps /** * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. */ + @Since("0.8.0") def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { throw new IllegalArgumentException("Number of initialization steps must be positive") @@ -132,12 +148,14 @@ class KMeans private ( /** * The distance threshold within which we've consider centers to have converged. */ + @Since("1.4.0") def getEpsilon: Double = epsilon /** * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. */ + @Since("0.8.0") def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon this @@ -146,9 +164,13 @@ class KMeans private ( /** * The random seed for cluster initialization. */ + @Since("1.4.0") def getSeed: Long = seed - /** Set the random seed for cluster initialization. */ + /** + * Set the random seed for cluster initialization. + */ + @Since("1.4.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -163,6 +185,7 @@ class KMeans private ( * The condition model.k == this.k must be met, failure results * in an IllegalArgumentException. */ + @Since("1.4.0") def setInitialModel(model: KMeansModel): this.type = { require(model.k == k, "mismatched cluster count") initialModel = Some(model) @@ -173,6 +196,7 @@ class KMeans private ( * Train a K-means model on the given set of points; `data` should be cached for high * performance, because this is an iterative algorithm. */ + @Since("0.8.0") def run(data: RDD[Vector]): KMeansModel = { if (data.getStorageLevel == StorageLevel.NONE) { @@ -345,7 +369,7 @@ class KMeans private ( : Array[Array[VectorWithNorm]] = { // Initialize empty centers and point costs. val centers = Array.tabulate(runs)(r => ArrayBuffer.empty[VectorWithNorm]) - var costs = data.map(_ => Vectors.dense(Array.fill(runs)(Double.PositiveInfinity))).cache() + var costs = data.map(_ => Array.fill(runs)(Double.PositiveInfinity)) // Initialize each run's first center to a random point. 
val seed = new XORShiftRandom(this.seed).nextInt() @@ -370,21 +394,28 @@ class KMeans private ( val bcNewCenters = data.context.broadcast(newCenters) val preCosts = costs costs = data.zip(preCosts).map { case (point, cost) => - Vectors.dense( Array.tabulate(runs) { r => math.min(KMeans.pointCost(bcNewCenters.value(r), point), cost(r)) - }) - }.cache() + } + }.persist(StorageLevel.MEMORY_AND_DISK) val sumCosts = costs - .aggregate(Vectors.zeros(runs))( + .aggregate(new Array[Double](runs))( seqOp = (s, v) => { // s += v - axpy(1.0, v, s) + var r = 0 + while (r < runs) { + s(r) += v(r) + r += 1 + } s }, combOp = (s0, s1) => { // s0 += s1 - axpy(1.0, s1, s0) + var r = 0 + while (r < runs) { + s0(r) += s1(r) + r += 1 + } s0 } ) @@ -431,10 +462,13 @@ class KMeans private ( /** * Top-level methods for calling K-means clustering. */ +@Since("0.8.0") object KMeans { // Initialization mode names + @Since("0.8.0") val RANDOM = "random" + @Since("0.8.0") val K_MEANS_PARALLEL = "k-means||" /** @@ -447,6 +481,7 @@ object KMeans { * @param initializationMode initialization model, either "random" or "k-means||" (default). * @param seed random seed value for cluster initialization */ + @Since("1.3.0") def train( data: RDD[Vector], k: Int, @@ -471,6 +506,7 @@ object KMeans { * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -487,6 +523,7 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -497,6 +534,7 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index 8ecb3df11d95..45021f4375ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -23,6 +23,7 @@ import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.pmml.PMMLExportable @@ -35,28 +36,44 @@ import org.apache.spark.sql.Row /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. */ -class KMeansModel ( - val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable { +@Since("0.8.0") +class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector]) + extends Saveable with Serializable with PMMLExportable { - /** A Java-friendly constructor that takes an Iterable of Vectors. */ + /** + * A Java-friendly constructor that takes an Iterable of Vectors. + */ + @Since("1.4.0") def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray) - /** Total number of clusters. */ + /** + * Total number of clusters. + */ + @Since("0.8.0") def k: Int = clusterCenters.length - /** Returns the cluster index that a given point belongs to. */ + /** + * Returns the cluster index that a given point belongs to. 
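// The k-means|| hunk above replaces breeze-vector cost tracking with plain Array[Double] buffers
// summed by hand-rolled while loops. A standalone sketch of that aggregate pattern (not part of
// the patch), assuming an existing SparkContext `sc` and synthetic per-run costs:
import org.apache.spark.SparkContext

def sumCostsPerRun(sc: SparkContext, runs: Int): Array[Double] = {
  val costs = sc.parallelize(1 to 1000).map(_ => Array.fill(runs)(1.0))  // one cost per run, per point
  costs.aggregate(new Array[Double](runs))(
    seqOp = (s, v) => {      // s += v, element-wise, without allocating a Vector per record
      var r = 0
      while (r < runs) { s(r) += v(r); r += 1 }
      s
    },
    combOp = (s0, s1) => {   // s0 += s1
      var r = 0
      while (r < runs) { s0(r) += s1(r); r += 1 }
      s0
    })
}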
+ */ + @Since("0.8.0") def predict(point: Vector): Int = { KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1 } - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + */ + @Since("1.0.0") def predict(points: RDD[Vector]): RDD[Int] = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = points.context.broadcast(centersWithNorm) points.map(p => KMeans.findClosest(bcCentersWithNorm.value, new VectorWithNorm(p))._1) } - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + */ + @Since("1.0.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] @@ -64,6 +81,7 @@ class KMeansModel ( * Return the K-means cost (sum of squared distances of points to their nearest center) for this * model on the given data. */ + @Since("0.8.0") def computeCost(data: RDD[Vector]): Double = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = data.context.broadcast(centersWithNorm) @@ -73,6 +91,7 @@ class KMeansModel ( private def clusterCentersWithNorm: Iterable[VectorWithNorm] = clusterCenters.map(new VectorWithNorm(_)) + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { KMeansModel.SaveLoadV1_0.save(sc, this, path) } @@ -80,7 +99,10 @@ class KMeansModel ( override protected def formatVersion: String = "1.0" } +@Since("1.4.0") object KMeansModel extends Loader[KMeansModel] { + + @Since("1.4.0") override def load(sc: SparkContext, path: String): KMeansModel = { KMeansModel.SaveLoadV1_0.load(sc, path) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6d77c5..92a321afb0ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{DenseVector => BDV} import org.apache.spark.Logging -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.graphx._ import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -44,6 +44,7 @@ import org.apache.spark.util.Utils * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation * (Wikipedia)]] */ +@Since("1.3.0") @Experimental class LDA private ( private var k: Int, @@ -54,19 +55,26 @@ class LDA private ( private var checkpointInterval: Int, private var ldaOptimizer: LDAOptimizer) extends Logging { + /** + * Constructs a LDA instance with default parameters. + */ + @Since("1.3.0") def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1), topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer) /** * Number of topics to infer. I.e., the number of soft cluster centers. + * */ + @Since("1.3.0") def getK: Int = k /** * Number of topics to infer. I.e., the number of soft cluster centers. * (default = 10) */ + @Since("1.3.0") def setK(k: Int): this.type = { require(k > 0, s"LDA k (number of clusters) must be > 0, but was set to $k") this.k = k @@ -79,7 +87,26 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. 
*/ - def getDocConcentration: Vector = this.docConcentration + @Since("1.5.0") + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named "alpha") for the prior placed on documents' + * distributions over topics ("theta"). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + @Since("1.3.0") + def getDocConcentration: Double = { + val parameter = docConcentration(0) + if (docConcentration.size == 1) { + parameter + } else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter + } + } /** * Concentration parameter (commonly named "alpha") for the prior placed on documents' @@ -105,24 +132,44 @@ class LDA private ( * - default = uniformly (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. */ + @Since("1.5.0") def setDocConcentration(docConcentration: Vector): this.type = { + require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** + * Replicates a [[Double]] docConcentration to create a symmetric prior. + */ + @Since("1.3.0") def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } - /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + /** + * Alias for [[getAsymmetricDocConcentration]] + */ + @Since("1.5.0") + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + + /** + * Alias for [[getDocConcentration]] + */ + @Since("1.3.0") + def getAlpha: Double = getDocConcentration - /** Alias for [[setDocConcentration()]] */ + /** + * Alias for [[setDocConcentration()]] + */ + @Since("1.5.0") def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) - /** Alias for [[setDocConcentration()]] */ + /** + * Alias for [[setDocConcentration()]] + */ + @Since("1.3.0") def setAlpha(alpha: Double): this.type = setDocConcentration(alpha) /** @@ -134,6 +181,7 @@ class LDA private ( * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. */ + @Since("1.3.0") def getTopicConcentration: Double = this.topicConcentration /** @@ -158,35 +206,50 @@ class LDA private ( * - default = (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. */ + @Since("1.3.0") def setTopicConcentration(topicConcentration: Double): this.type = { this.topicConcentration = topicConcentration this } - /** Alias for [[getTopicConcentration]] */ + /** + * Alias for [[getTopicConcentration]] + */ + @Since("1.3.0") def getBeta: Double = getTopicConcentration - /** Alias for [[setTopicConcentration()]] */ + /** + * Alias for [[setTopicConcentration()]] + */ + @Since("1.3.0") def setBeta(beta: Double): this.type = setTopicConcentration(beta) /** * Maximum number of iterations for learning. */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Maximum number of iterations for learning. 
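// A small sketch (not part of the patch above) contrasting the new getAsymmetricDocConcentration
// with the now symmetric-only getDocConcentration. Values are invented; no Spark context needed.
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

def docConcentrationSketch(): Unit = {
  val symmetric = new LDA().setK(5).setDocConcentration(0.5)   // single Double, replicated prior
  println(symmetric.getDocConcentration)                       // 0.5
  val asymmetric = new LDA().setK(3).setDocConcentration(Vectors.dense(0.2, 0.3, 0.5))
  println(asymmetric.getAsymmetricDocConcentration)            // the full per-topic Vector
  // Calling getDocConcentration on `asymmetric` would trip its require() check, as documented.
}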
* (default = 20) */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } - /** Random seed */ + /** + * Random seed + */ + @Since("1.3.0") def getSeed: Long = seed - /** Random seed */ + /** + * Random seed + */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -195,6 +258,7 @@ class LDA private ( /** * Period (in iterations) between checkpoints. */ + @Since("1.3.0") def getCheckpointInterval: Int = checkpointInterval /** @@ -205,6 +269,7 @@ class LDA private ( * * @see [[org.apache.spark.SparkContext#setCheckpointDir]] */ + @Since("1.3.0") def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval this @@ -216,6 +281,7 @@ class LDA private ( * * LDAOptimizer used to perform the actual calculation */ + @Since("1.4.0") @DeveloperApi def getOptimizer: LDAOptimizer = ldaOptimizer @@ -224,6 +290,7 @@ class LDA private ( * * LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer) */ + @Since("1.4.0") @DeveloperApi def setOptimizer(optimizer: LDAOptimizer): this.type = { this.ldaOptimizer = optimizer @@ -234,6 +301,7 @@ class LDA private ( * Set the LDAOptimizer used to perform the actual calculation by algorithm name. * Currently "em", "online" are supported. */ + @Since("1.4.0") def setOptimizer(optimizerName: String): this.type = { this.ldaOptimizer = optimizerName.toLowerCase match { @@ -254,6 +322,7 @@ class LDA private ( * Document IDs must be unique and >= 0. * @return Inferred LDA model */ + @Since("1.3.0") def run(documents: RDD[(Long, Vector)]): LDAModel = { val state = ldaOptimizer.initialize(documents, this) var iter = 0 @@ -268,7 +337,10 @@ class LDA private ( state.getLDAModel(iterationTimes) } - /** Java-friendly version of [[run()]] */ + /** + * Java-friendly version of [[run()]] + */ + @Since("1.3.0") def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = { run(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6af90d7287ff..15129e0dd5a9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argtopk, normalize, sum} +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax, argtopk, normalize, sum} import breeze.numerics.{exp, lgamma} import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats @@ -25,8 +25,8 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental -import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -43,12 +43,15 @@ import org.apache.spark.util.BoundedPriorityQueue * including local and distributed data structures. 
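// A minimal end-to-end sketch (not part of the patch above) tying together the LDA setters and
// run() annotated in these hunks. It assumes an already-created SparkContext `sc`; the helper
// name and the tiny bag-of-words corpus are invented for illustration.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

def ldaRunSketch(sc: SparkContext): Unit = {
  val corpus: RDD[(Long, Vector)] = sc.parallelize(Seq(
    (0L, Vectors.dense(1.0, 0.0, 3.0, 0.0)),
    (1L, Vectors.dense(0.0, 2.0, 0.0, 1.0)),
    (2L, Vectors.dense(4.0, 0.0, 1.0, 0.0)))).cache()  // document IDs must be unique and >= 0
  val model = new LDA()
    .setK(2)
    .setMaxIterations(10)
    .setSeed(1L)
    .setOptimizer("em")    // or "online"; see setOptimizer(optimizerName) above
    .run(corpus)
  model.describeTopics(maxTermsPerTopic = 3).zipWithIndex.foreach { case ((terms, weights), i) =>
    println(s"topic $i: " + terms.zip(weights).mkString(", "))
  }
}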
*/ @Experimental +@Since("1.3.0") abstract class LDAModel private[clustering] extends Saveable { /** Number of topics */ + @Since("1.3.0") def k: Int /** Vocabulary size (number of terms or terms in the vocabulary) */ + @Since("1.3.0") def vocabSize: Int /** @@ -57,6 +60,7 @@ abstract class LDAModel private[clustering] extends Saveable { * * This is the parameter to a Dirichlet distribution. */ + @Since("1.5.0") def docConcentration: Vector /** @@ -68,6 +72,7 @@ abstract class LDAModel private[clustering] extends Saveable { * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. */ + @Since("1.5.0") def topicConcentration: Double /** @@ -81,6 +86,7 @@ abstract class LDAModel private[clustering] extends Saveable { * This is a matrix of size vocabSize x k, where each column is a topic. * No guarantees are given about the ordering of the topics. */ + @Since("1.3.0") def topicsMatrix: Matrix /** @@ -91,6 +97,7 @@ abstract class LDAModel private[clustering] extends Saveable { * (term indices, term weights in topic). * Each topic's terms are sorted in order of decreasing weight. */ + @Since("1.3.0") def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] /** @@ -102,6 +109,7 @@ abstract class LDAModel private[clustering] extends Saveable { * (term indices, term weights in topic). * Each topic's terms are sorted in order of decreasing weight. */ + @Since("1.3.0") def describeTopics(): Array[(Array[Int], Array[Double])] = describeTopics(vocabSize) /* TODO (once LDA can be trained with Strings or given a dictionary) @@ -185,18 +193,24 @@ abstract class LDAModel private[clustering] extends Saveable { * @param topics Inferred topics (vocabSize x k matrix). */ @Experimental +@Since("1.3.0") class LocalLDAModel private[clustering] ( - val topics: Matrix, - override val docConcentration: Vector, - override val topicConcentration: Double, - override protected[clustering] val gammaShape: Double) extends LDAModel with Serializable { + @Since("1.3.0") val topics: Matrix, + @Since("1.5.0") override val docConcentration: Vector, + @Since("1.5.0") override val topicConcentration: Double, + override protected[clustering] val gammaShape: Double = 100) + extends LDAModel with Serializable { + @Since("1.3.0") override def k: Int = topics.numCols + @Since("1.3.0") override def vocabSize: Int = topics.numRows + @Since("1.3.0") override def topicsMatrix: Matrix = topics + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val brzTopics = topics.toBreeze.toDenseMatrix Range(0, k).map { topicIndex => @@ -209,6 +223,7 @@ class LocalLDAModel private[clustering] ( override protected def formatVersion = "1.0" + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -217,26 +232,44 @@ class LocalLDAModel private[clustering] ( // TODO: declare in LDAModel and override once implemented in DistributedLDAModel /** * Calculates a lower bound on the log likelihood of the entire corpus. + * + * See Equation (16) in original Online LDA paper. 
+ * * @param documents test corpus to use for calculating log likelihood + * @return variational lower bound on the log likelihood of the entire corpus */ - def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents, + @Since("1.5.0") + def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) /** - * Calculate an upper bound bound on perplexity. See Equation (16) in original Online - * LDA paper. + * Java-friendly version of [[logLikelihood]] + */ + @Since("1.5.0") + def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { + logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + } + + /** + * Calculate an upper bound on perplexity. (Lower is better.) + * See Equation (16) in original Online LDA paper. + * * @param documents test corpus to use for calculating perplexity - * @return variational upper bound on log perplexity per word + * @return Variational upper bound on log perplexity per token. */ + @Since("1.5.0") def logPerplexity(documents: RDD[(Long, Vector)]): Double = { - val corpusWords = documents + val corpusTokenCount = documents .map { case (_, termCounts) => termCounts.toArray.sum } .sum() - val perWordBound = -logLikelihood(documents) / corpusWords + -logLikelihood(documents) / corpusTokenCount + } - perWordBound + /** Java-friendly version of [[logPerplexity]] */ + @Since("1.5.0") + def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { + logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } /** @@ -244,17 +277,20 @@ class LocalLDAModel private[clustering] ( * log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)] * This bound is derived by decomposing the LDA model to: * log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + D(q|p) - * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in original Online LDA paper. + * and noting that the KL-divergence D(q|p) >= 0. + * + * See Equation (16) in original Online LDA paper, as well as Appendix A.3 in the JMLR version of + * the original LDA paper.
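// A sketch (not part of the patch above) of held-out evaluation with the logLikelihood and
// logPerplexity methods in this hunk. It assumes a trained LocalLDAModel (e.g. from the online
// optimizer) and an existing SparkContext `sc`; the held-out counts are invented.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.LocalLDAModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

def evaluateLdaSketch(sc: SparkContext, model: LocalLDAModel): Unit = {
  val heldOut: RDD[(Long, Vector)] = sc.parallelize(Seq(
    (100L, Vectors.dense(2.0, 0.0, 1.0, 0.0)),
    (101L, Vectors.dense(0.0, 1.0, 0.0, 3.0))))
  val bound = model.logLikelihood(heldOut)       // variational lower bound on log p(heldOut)
  val perplexity = model.logPerplexity(heldOut)  // == -bound / total token count; lower is better
  println(s"log-likelihood bound: $bound, per-token log perplexity: $perplexity")
}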
* @param documents a subset of the test corpus * @param alpha document-topic Dirichlet prior parameters - * @param eta topic-word Dirichlet prior parameters + * @param eta topic-word Dirichlet prior parameter * @param lambda parameters for variational q(beta | lambda) topic-word distributions * @param gammaShape shape parameter for random initialization of variational q(theta | gamma) * topic mixture distributions * @param k number of topics * @param vocabSize number of unique terms in the entire test corpus */ - private def bound( + private def logLikelihoodBound( documents: RDD[(Long, Vector)], alpha: Vector, eta: Double, @@ -266,33 +302,38 @@ class LocalLDAModel private[clustering] ( // transpose because dirichletExpectation normalizes by row and we need to normalize // by topic (columns of lambda) val Elogbeta = LDAUtils.dirichletExpectation(lambda.t).t + val ElogbetaBc = documents.sparkContext.broadcast(Elogbeta) + + // Sum bound components for each document: + // component for prob(tokens) + component for prob(document-topic distribution) + val corpusPart = + documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) => + val localElogbeta = ElogbetaBc.value + var docBound = 0.0D + val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference( + termCounts, exp(localElogbeta), brzAlpha, gammaShape, k) + val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad) + + // E[log p(doc | theta, beta)] + termCounts.foreachActive { case (idx, count) => + docBound += count * LDAUtils.logSumExp(Elogthetad + localElogbeta(idx, ::).t) + } + // E[log p(theta | alpha) - log q(theta | gamma)] + docBound += sum((brzAlpha - gammad) :* Elogthetad) + docBound += sum(lgamma(gammad) - lgamma(brzAlpha)) + docBound += lgamma(sum(brzAlpha)) - lgamma(sum(gammad)) - var score = documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) => - var docScore = 0.0D - val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference( - termCounts, exp(Elogbeta), brzAlpha, gammaShape, k) - val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad) - - // E[log p(doc | theta, beta)] - termCounts.foreachActive { case (idx, count) => - docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t) - } - // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector - docScore += sum((brzAlpha - gammad) :* Elogthetad) - docScore += sum(lgamma(gammad) - lgamma(brzAlpha)) - docScore += lgamma(sum(brzAlpha)) - lgamma(sum(gammad)) - - docScore - }.sum() - - // E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar - score += sum((eta - lambda) :* Elogbeta) - score += sum(lgamma(lambda) - lgamma(eta)) + docBound + }.sum() + // Bound component for prob(topic-term distributions): + // E[log p(beta | eta) - log q(beta | lambda)] val sumEta = eta * vocabSize - score += sum(lgamma(sumEta) - lgamma(sum(lambda(::, breeze.linalg.*)))) + val topicsPart = sum((eta - lambda) :* Elogbeta) + + sum(lgamma(lambda) - lgamma(eta)) + + sum(lgamma(sumEta) - lgamma(sum(lambda(::, breeze.linalg.*)))) - score + corpusPart + topicsPart } /** @@ -305,11 +346,13 @@ class LocalLDAModel private[clustering] ( * @param documents documents to predict topic mixture distributions for * @return An RDD of (document ID, topic mixture distribution for document) */ + @Since("1.3.0") // TODO: declare in LDAModel and override once implemented in DistributedLDAModel def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] 
= { // Double transpose because dirichletExpectation normalizes by row and we need to normalize // by topic (columns of lambda) val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t) + val expElogbetaBc = documents.sparkContext.broadcast(expElogbeta) val docConcentrationBrz = this.docConcentration.toBreeze val gammaShape = this.gammaShape val k = this.k @@ -320,7 +363,7 @@ class LocalLDAModel private[clustering] ( } else { val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, - expElogbeta, + expElogbetaBc.value, docConcentrationBrz, gammaShape, k) @@ -329,10 +372,20 @@ class LocalLDAModel private[clustering] ( } } -} + /** + * Java-friendly version of [[topicDistributions]] + */ + @Since("1.4.1") + def topicDistributions( + documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { + val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + JavaPairRDD.fromRDD(distributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) + } +} @Experimental +@Since("1.5.0") object LocalLDAModel extends Loader[LocalLDAModel] { private object SaveLoadV1_0 { @@ -384,7 +437,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] { Loader.checkSchema[Data](dataFrame.schema) val topics = dataFrame.collect() val vocabSize = topics(0).getAs[Vector](0).size - val k = topics.size + val k = topics.length val brzTopics = BDM.zeros[Double](vocabSize, k) topics.foreach { case Row(vec: Vector, ind: Int) => @@ -392,11 +445,11 @@ object LocalLDAModel extends Loader[LocalLDAModel] { } val topicsMat = Matrices.fromBreeze(brzTopics) - // TODO: initialize with docConcentration, topicConcentration, and gammaShape after SPARK-9940 new LocalLDAModel(topicsMat, docConcentration, topicConcentration, gammaShape) } } + @Since("1.5.0") override def load(sc: SparkContext, path: String): LocalLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats @@ -436,15 +489,17 @@ object LocalLDAModel extends Loader[LocalLDAModel] { * than the [[LocalLDAModel]]. */ @Experimental +@Since("1.3.0") class DistributedLDAModel private[clustering] ( private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount], private[clustering] val globalTopicTotals: LDA.TopicCounts, - val k: Int, - val vocabSize: Int, - override val docConcentration: Vector, - override val topicConcentration: Double, - override protected[clustering] val gammaShape: Double, - private[spark] val iterationTimes: Array[Double]) extends LDAModel { + @Since("1.3.0") val k: Int, + @Since("1.3.0") val vocabSize: Int, + @Since("1.5.0") override val docConcentration: Vector, + @Since("1.5.0") override val topicConcentration: Double, + private[spark] val iterationTimes: Array[Double], + override protected[clustering] val gammaShape: Double = 100) + extends LDAModel { import LDA._ @@ -453,6 +508,7 @@ class DistributedLDAModel private[clustering] ( * The local model stores the inferred topics but not the topic distributions for training * documents. */ + @Since("1.3.0") def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -463,6 +519,7 @@ class DistributedLDAModel private[clustering] ( * * WARNING: This matrix is collected from an RDD. Beware memory usage when vocabSize, k are large. 
*/ + @Since("1.3.0") override lazy val topicsMatrix: Matrix = { // Collect row-major topics val termTopicCounts: Array[(Int, TopicCounts)] = @@ -481,6 +538,7 @@ class DistributedLDAModel private[clustering] ( Matrices.fromBreeze(brzTopics) } + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val numTopics = k // Note: N_k is not needed to find the top terms, but it is needed to normalize weights @@ -520,6 +578,7 @@ class DistributedLDAModel private[clustering] ( * (IDs for the documents, weights of the topic in these documents). * For each topic, documents are sorted in order of decreasing topic weights. */ + @Since("1.5.0") def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = { val numTopics = k val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] = @@ -546,6 +605,52 @@ class DistributedLDAModel private[clustering] ( } } + /** + * Return the top topic for each (doc, term) pair. I.e., for each document, what is the most + * likely topic generating each term? + * + * @return RDD of (doc ID, assignment of top topic index for each term), + * where the assignment is specified via a pair of zippable arrays + * (term indices, topic indices). Note that terms will be omitted if not present in + * the document. + */ + @Since("1.5.0") + lazy val topicAssignments: RDD[(Long, Array[Int], Array[Int])] = { + // For reference, compare the below code with the core part of EMLDAOptimizer.next(). + val eta = topicConcentration + val W = vocabSize + val alpha = docConcentration(0) + val N_k = globalTopicTotals + val sendMsg: EdgeContext[TopicCounts, TokenCount, (Array[Int], Array[Int])] => Unit = + (edgeContext) => { + // E-STEP: Compute gamma_{wjk} (smoothed topic distributions). + val scaledTopicDistribution: TopicCounts = + computePTopic(edgeContext.srcAttr, edgeContext.dstAttr, N_k, W, eta, alpha) + // For this (doc j, term w), send top topic k to doc vertex. + val topTopic: Int = argmax(scaledTopicDistribution) + val term: Int = index2term(edgeContext.dstId) + edgeContext.sendToSrc((Array(term), Array(topTopic))) + } + val mergeMsg: ((Array[Int], Array[Int]), (Array[Int], Array[Int])) => (Array[Int], Array[Int]) = + (terms_topics0, terms_topics1) => { + (terms_topics0._1 ++ terms_topics1._1, terms_topics0._2 ++ terms_topics1._2) + } + // M-STEP: Aggregation computes new N_{kj}, N_{wk} counts. + val perDocAssignments = + graph.aggregateMessages[(Array[Int], Array[Int])](sendMsg, mergeMsg).filter(isDocumentVertex) + perDocAssignments.map { case (docID: Long, (terms: Array[Int], topics: Array[Int])) => + // TODO: Avoid zip, which is inefficient. + val (sortedTerms, sortedTopics) = terms.zip(topics).sortBy(_._1).unzip + (docID, sortedTerms.toArray, sortedTopics.toArray) + } + } + + /** Java-friendly version of [[topicAssignments]] */ + @Since("1.5.0") + lazy val javaTopicAssignments: JavaRDD[(java.lang.Long, Array[Int], Array[Int])] = { + topicAssignments.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Int])]].toJavaRDD() + } + // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? @@ -559,6 +664,7 @@ class DistributedLDAModel private[clustering] ( * - Even with [[logPrior]], this is NOT the same as the data log likelihood given the * hyperparameters. 
*/ + @Since("1.3.0") lazy val logLikelihood: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -583,8 +689,9 @@ class DistributedLDAModel private[clustering] ( /** * Log probability of the current parameter estimate: - * log P(topics, topic distributions for docs | alpha, eta) + * log P(topics, topic distributions for docs | alpha, eta) */ + @Since("1.3.0") lazy val logPrior: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -616,13 +723,17 @@ class DistributedLDAModel private[clustering] ( * * @return RDD of (document ID, topic distribution) pairs */ + @Since("1.3.0") def topicDistributions: RDD[(Long, Vector)] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => (docID.toLong, Vectors.fromBreeze(normalize(topicCounts, 1.0))) } } - /** Java-friendly version of [[topicDistributions]] */ + /** + * Java-friendly version of [[topicDistributions]] + */ + @Since("1.4.1") def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = { JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) } @@ -631,6 +742,7 @@ class DistributedLDAModel private[clustering] ( * For each document, return the top k weighted topics for that document and their weights. * @return RDD of (doc ID, topic indices, topic weights) */ + @Since("1.5.0") def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => val topIndices = argtopk(topicCounts, k) @@ -644,11 +756,24 @@ class DistributedLDAModel private[clustering] ( } } + /** + * Java-friendly version of [[topTopicsPerDocument]] + */ + @Since("1.5.0") + def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { + val topics = topTopicsPerDocument(k) + topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() + } + // TODO: // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? 
override protected def formatVersion = "1.0" + /** + * Save this model to the given path. + */ + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { DistributedLDAModel.SaveLoadV1_0.save( sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration, @@ -658,6 +783,7 @@ class DistributedLDAModel private[clustering] ( @Experimental +@Since("1.5.0") object DistributedLDAModel extends Loader[DistributedLDAModel] { private object SaveLoadV1_0 { @@ -744,11 +870,12 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { val graph: Graph[LDA.TopicCounts, LDA.TokenCount] = Graph(vertices, edges) new DistributedLDAModel(graph, globalTopicTotals, globalTopicTotals.length, vocabSize, - docConcentration, topicConcentration, gammaShape, iterationTimes) + docConcentration, topicConcentration, iterationTimes, gammaShape) } } + @Since("1.5.0") override def load(sc: SparkContext, path: String): DistributedLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats @@ -762,10 +889,9 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { val classNameV1_0 = SaveLoadV1_0.thisClassName val model = (loadedClassName, loadedVersion) match { - case (className, "1.0") if className == classNameV1_0 => { + case (className, "1.0") if className == classNameV1_0 => DistributedLDAModel.SaveLoadV1_0.load(sc, path, vocabSize, docConcentration, topicConcentration, iterationTimes.toArray, gammaShape) - } case _ => throw new Exception( s"DistributedLDAModel.load did not recognize model with (className, format version):" + s"($loadedClassName, $loadedVersion). Supported: ($classNameV1_0, 1.0)") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index b0e14cb8296a..38486e949bbc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -23,7 +23,7 @@ import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, su import breeze.numerics.{trigamma, abs, exp} import breeze.stats.distributions.{Gamma, RandBasis} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer @@ -36,6 +36,7 @@ import org.apache.spark.rdd.RDD * An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can * hold optimizer-specific parameters for users to set. */ +@Since("1.4.0") @DeveloperApi sealed trait LDAOptimizer { @@ -73,8 +74,8 @@ sealed trait LDAOptimizer { * - Paper which clearly explains several algorithms, including EM: * Asuncion, Welling, Smyth, and Teh. * "On Smoothing and Inference for Topic Models." UAI, 2009. - * */ +@Since("1.4.0") @DeveloperApi final class EMLDAOptimizer extends LDAOptimizer { @@ -95,10 +96,8 @@ final class EMLDAOptimizer extends LDAOptimizer { * Compute bipartite term/doc graph.
*/ override private[clustering] def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = { - val docConcentration = lda.getDocConcentration(0) - require({ - lda.getDocConcentration.toArray.forall(_ == docConcentration) - }, "EMLDAOptimizer currently only supports symmetric document-topic priors") + // EMLDAOptimizer currently only supports symmetric document-topic priors + val docConcentration = lda.getDocConcentration val topicConcentration = lda.getTopicConcentration val k = lda.getK @@ -168,7 +167,7 @@ final class EMLDAOptimizer extends LDAOptimizer { edgeContext.sendToDst((false, scaledTopicDistribution)) edgeContext.sendToSrc((false, scaledTopicDistribution)) } - // This is a hack to detect whether we could modify the values in-place. + // The Boolean is a hack to detect whether we could modify the values in-place. // TODO: Add zero/seqOp/combOp option to aggregateMessages. (SPARK-5438) val mergeMsg: ((Boolean, TopicCounts), (Boolean, TopicCounts)) => (Boolean, TopicCounts) = (m0, m1) => { @@ -209,11 +208,11 @@ final class EMLDAOptimizer extends LDAOptimizer { override private[clustering] def getLDAModel(iterationTimes: Array[Double]): LDAModel = { require(graph != null, "graph is null, EMLDAOptimizer not initialized.") this.graphCheckpointer.deleteAllCheckpoints() - // This assumes gammaShape = 100 in OnlineLDAOptimizer to ensure equivalence in LDAModel.toLocal - // conversion + // The constructor's default arguments assume gammaShape = 100 to ensure equivalence in + // LDAModel.toLocal conversion new DistributedLDAModel(this.graph, this.globalTopicTotals, this.k, this.vocabSize, Vectors.dense(Array.fill(this.k)(this.docConcentration)), this.topicConcentration, - 100, iterationTimes) + iterationTimes) } } @@ -228,6 +227,7 @@ final class EMLDAOptimizer extends LDAOptimizer { * Original Online LDA paper: * Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010. */ +@Since("1.4.0") @DeveloperApi final class OnlineLDAOptimizer extends LDAOptimizer { @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -277,6 +277,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. */ + @Since("1.4.0") def getTau0: Double = this.tau0 /** @@ -284,6 +285,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * iterations count less. * Default: 1024, following the original Online LDA paper. */ + @Since("1.4.0") def setTau0(tau0: Double): this.type = { require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0") this.tau0 = tau0 @@ -293,6 +295,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Learning rate: exponential decay rate */ + @Since("1.4.0") def getKappa: Double = this.kappa /** @@ -300,6 +303,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * (0.5, 1.0] to guarantee asymptotic convergence. * Default: 0.51, based on the original Online LDA paper. 
*/ + @Since("1.4.0") def setKappa(kappa: Double): this.type = { require(kappa >= 0, s"Online LDA kappa must be nonnegative, but was set to $kappa") this.kappa = kappa @@ -309,6 +313,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Mini-batch fraction, which sets the fraction of document sampled and used in each iteration */ + @Since("1.4.0") def getMiniBatchFraction: Double = this.miniBatchFraction /** @@ -321,6 +326,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * * Default: 0.05, i.e., 5% of total documents. */ + @Since("1.4.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { require(miniBatchFraction > 0.0 && miniBatchFraction <= 1.0, s"Online LDA miniBatchFraction must be in range (0,1], but was set to $miniBatchFraction") @@ -329,18 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ - def getOptimzeAlpha: Boolean = this.optimizeAlpha + @Since("1.5.0") + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. * * Default: false */ - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { - this.optimizeAlpha = optimizeAlpha + @Since("1.5.0") + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { + this.optimizeDocConcentration = optimizeDocConcentration this } @@ -378,18 +386,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { this.k = lda.getK this.corpusSize = docs.count() this.vocabSize = docs.first()._2.size - this.alpha = if (lda.getDocConcentration.size == 1) { - if (lda.getDocConcentration(0) == -1) Vectors.dense(Array.fill(k)(1.0 / k)) + this.alpha = if (lda.getAsymmetricDocConcentration.size == 1) { + if (lda.getAsymmetricDocConcentration(0) == -1) Vectors.dense(Array.fill(k)(1.0 / k)) else { - require(lda.getDocConcentration(0) >= 0, s"all entries in alpha must be >=0, got: $alpha") - Vectors.dense(Array.fill(k)(lda.getDocConcentration(0))) + require(lda.getAsymmetricDocConcentration(0) >= 0, + s"all entries in alpha must be >=0, got: $alpha") + Vectors.dense(Array.fill(k)(lda.getAsymmetricDocConcentration(0))) } } else { - require(lda.getDocConcentration.size == k, s"alpha must have length k, got: $alpha") - lda.getDocConcentration.foreachActive { case (_, x) => + require(lda.getAsymmetricDocConcentration.size == k, + s"alpha must have length k, got: $alpha") + lda.getAsymmetricDocConcentration.foreachActive { case (_, x) => require(x >= 0, s"all entries in alpha must be >= 0, got: $alpha") } - lda.getDocConcentration + lda.getAsymmetricDocConcentration } this.eta = if (lda.getTopicConcentration == -1) 1.0 / k else lda.getTopicConcentration this.randomGenerator = new Random(lda.getSeed) @@ -419,6 +429,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val k = this.k val vocabSize = this.vocabSize val expElogbeta = exp(LDAUtils.dirichletExpectation(lambda)).t + val expElogbetaBc = batch.sparkContext.broadcast(expElogbeta) val alpha = this.alpha.toBreeze val gammaShape = this.gammaShape @@ -433,20 +444,21 @@ final class OnlineLDAOptimizer extends LDAOptimizer { case v: SparseVector => v.indices.toList } val (gammad, 
sstats) = OnlineLDAOptimizer.variationalTopicInference( - termCounts, expElogbeta, alpha, gammaShape, k) + termCounts, expElogbetaBc.value, alpha, gammaShape, k) stat(::, ids) := stat(::, ids).toDenseMatrix + sstats gammaPart = gammad :: gammaPart } Iterator((stat, gammaPart)) } val statsSum: BDM[Double] = stats.map(_._1).reduce(_ += _) + expElogbetaBc.unpersist() val gammat: BDM[Double] = breeze.linalg.DenseMatrix.vertcat( stats.map(_._2).reduce(_ ++ _).map(_.toDenseMatrix): _*) val batchResult = statsSum :* expElogbeta.t // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) - if (optimizeAlpha) updateAlpha(gammat) + if (optimizeDocConcentration) updateAlpha(gammat) this } @@ -540,21 +552,22 @@ private[clustering] object OnlineLDAOptimizer { val expElogthetad: BDV[Double] = exp(LDAUtils.dirichletExpectation(gammad)) // K val expElogbetad = expElogbeta(ids, ::).toDenseMatrix // ids * K - val phinorm: BDV[Double] = expElogbetad * expElogthetad :+ 1e-100 // ids - var meanchange = 1D + val phiNorm: BDV[Double] = expElogbetad * expElogthetad :+ 1e-100 // ids + var meanGammaChange = 1D val ctsVector = new BDV[Double](cts) // ids // Iterate between gamma and phi until convergence - while (meanchange > 1e-3) { + while (meanGammaChange > 1e-3) { val lastgamma = gammad.copy // K K * ids ids - gammad := (expElogthetad :* (expElogbetad.t * (ctsVector :/ phinorm))) :+ alpha + gammad := (expElogthetad :* (expElogbetad.t * (ctsVector :/ phiNorm))) :+ alpha expElogthetad := exp(LDAUtils.dirichletExpectation(gammad)) - phinorm := expElogbetad * expElogthetad :+ 1e-100 - meanchange = sum(abs(gammad - lastgamma)) / k + // TODO: Keep more values in log space, and only exponentiate when needed. + phiNorm := expElogbetad * expElogthetad :+ 1e-100 + meanGammaChange = sum(abs(gammad - lastgamma)) / k } - val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phinorm).asDenseMatrix + val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix (gammad, sstatsd) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala index f7e5ce1665fe..a9ba7b60bad0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala @@ -22,7 +22,7 @@ import breeze.numerics._ /** * Utility methods for LDA. 
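// A configuration sketch (not part of the patch above) for the OnlineLDAOptimizer knobs annotated
// in the preceding hunks, including the renamed setOptimizeDocConcentration. Values are the
// documented defaults except where noted; the helper name is invented.
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

def onlineLdaConfigSketch(): LDA = {
  val optimizer = new OnlineLDAOptimizer()
    .setTau0(1024.0)                     // downweights early iterations
    .setKappa(0.51)                      // exponential decay rate in (0.5, 1.0]
    .setMiniBatchFraction(0.05)          // 5% of documents sampled per iteration
    .setOptimizeDocConcentration(true)   // non-default: also learn the document-topic prior
  new LDA().setK(10).setOptimizer(optimizer)
}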
*/ -object LDAUtils { +private[clustering] object LDAUtils { /** * Log Sum Exp with overflow protection using the identity: * For any a: \log \sum_{n=1}^N \exp\{x_n\} = a + \log \sum_{n=1}^N \exp\{x_n - a\} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 407e43a024a2..6c76e26fd162 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -21,7 +21,7 @@ import org.json4s.JsonDSL._ import org.json4s._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl @@ -40,11 +40,14 @@ import org.apache.spark.{Logging, SparkContext, SparkException} * @param k number of clusters * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s */ +@Since("1.3.0") @Experimental -class PowerIterationClusteringModel( - val k: Int, - val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable { +class PowerIterationClusteringModel @Since("1.3.0") ( + @Since("1.3.0") val k: Int, + @Since("1.3.0") val assignments: RDD[PowerIterationClustering.Assignment]) + extends Saveable with Serializable { + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { PowerIterationClusteringModel.SaveLoadV1_0.save(sc, this, path) } @@ -52,7 +55,10 @@ class PowerIterationClusteringModel( override protected def formatVersion: String = "1.0" } +@Since("1.4.0") object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] { + + @Since("1.4.0") override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path) } @@ -65,6 +71,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode private[clustering] val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel" + @Since("1.4.0") def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -77,6 +84,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode dataRDD.write.parquet(Loader.dataPath(path)) } + @Since("1.4.0") def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { implicit val formats = DefaultFormats val sqlContext = new SQLContext(sc) @@ -113,6 +121,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]] */ @Experimental +@Since("1.3.0") class PowerIterationClustering private[clustering] ( private var k: Int, private var maxIterations: Int, @@ -120,14 +129,17 @@ class PowerIterationClustering private[clustering] ( import org.apache.spark.mllib.clustering.PowerIterationClustering._ - /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100, - * initMode: "random"}. + /** + * Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100, + * initMode: "random"}. 
*/ + @Since("1.3.0") def this() = this(k = 2, maxIterations = 100, initMode = "random") /** * Set the number of clusters. */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this @@ -136,6 +148,7 @@ class PowerIterationClustering private[clustering] ( /** * Set maximum number of iterations of the power iteration loop */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -145,6 +158,7 @@ class PowerIterationClustering private[clustering] ( * Set the initialization mode. This can be either "random" to use a random vector * as vertex properties, or "degree" to use normalized sum similarities. Default: random. */ + @Since("1.3.0") def setInitializationMode(mode: String): this.type = { this.initMode = mode match { case "random" | "degree" => mode @@ -165,6 +179,7 @@ class PowerIterationClustering private[clustering] ( * * @return a [[PowerIterationClusteringModel]] that contains the clustering result */ + @Since("1.5.0") def run(graph: Graph[Double, Double]): PowerIterationClusteringModel = { val w = normalize(graph) val w0 = initMode match { @@ -186,6 +201,7 @@ class PowerIterationClustering private[clustering] ( * * @return a [[PowerIterationClusteringModel]] that contains the clustering result */ + @Since("1.3.0") def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = { val w = normalize(similarities) val w0 = initMode match { @@ -198,6 +214,7 @@ class PowerIterationClustering private[clustering] ( /** * A Java-friendly version of [[PowerIterationClustering.run]]. */ + @Since("1.3.0") def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) : PowerIterationClusteringModel = { run(similarities.rdd.asInstanceOf[RDD[(Long, Long, Double)]]) @@ -221,6 +238,7 @@ class PowerIterationClustering private[clustering] ( } } +@Since("1.3.0") @Experimental object PowerIterationClustering extends Logging { @@ -230,6 +248,7 @@ object PowerIterationClustering extends Logging { * @param id node id * @param cluster assigned cluster id */ + @Since("1.3.0") @Experimental case class Assignment(id: Long, cluster: Int) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index d9b34cec6489..1d50ffec96fa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.rdd.RDD @@ -63,14 +63,18 @@ import org.apache.spark.util.random.XORShiftRandom * such that at time t + h the discount applied to the data from t is 0.5. * The definition remains the same whether the time unit is given * as batches or points. 
- * */ +@Since("1.2.0") @Experimental -class StreamingKMeansModel( - override val clusterCenters: Array[Vector], - val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging { +class StreamingKMeansModel @Since("1.2.0") ( + @Since("1.2.0") override val clusterCenters: Array[Vector], + @Since("1.2.0") val clusterWeights: Array[Double]) + extends KMeansModel(clusterCenters) with Logging { - /** Perform a k-means update on a batch of data. */ + /** + * Perform a k-means update on a batch of data. + */ + @Since("1.2.0") def update(data: RDD[Vector], decayFactor: Double, timeUnit: String): StreamingKMeansModel = { // find nearest cluster to each point @@ -82,6 +86,7 @@ class StreamingKMeansModel( (p1._1, p1._2 + p2._2) } val dim = clusterCenters(0).size + val pointStats: Array[(Int, (Vector, Long))] = closest .aggregateByKey((Vectors.zeros(dim), 0L))(mergeContribs, mergeContribs) .collect() @@ -162,29 +167,40 @@ class StreamingKMeansModel( * .trainOn(DStream) * }}} */ +@Since("1.2.0") @Experimental -class StreamingKMeans( - var k: Int, - var decayFactor: Double, - var timeUnit: String) extends Logging with Serializable { +class StreamingKMeans @Since("1.2.0") ( + @Since("1.2.0") var k: Int, + @Since("1.2.0") var decayFactor: Double, + @Since("1.2.0") var timeUnit: String) extends Logging with Serializable { + @Since("1.2.0") def this() = this(2, 1.0, StreamingKMeans.BATCHES) protected var model: StreamingKMeansModel = new StreamingKMeansModel(null, null) - /** Set the number of clusters. */ + /** + * Set the number of clusters. + */ + @Since("1.2.0") def setK(k: Int): this.type = { this.k = k this } - /** Set the decay factor directly (for forgetful algorithms). */ + /** + * Set the decay factor directly (for forgetful algorithms). + */ + @Since("1.2.0") def setDecayFactor(a: Double): this.type = { this.decayFactor = a this } - /** Set the half life and time unit ("batches" or "points") for forgetful algorithms. */ + /** + * Set the half life and time unit ("batches" or "points") for forgetful algorithms. + */ + @Since("1.2.0") def setHalfLife(halfLife: Double, timeUnit: String): this.type = { if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) { throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit) @@ -195,7 +211,10 @@ class StreamingKMeans( this } - /** Specify initial centers directly. */ + /** + * Specify initial centers directly. + */ + @Since("1.2.0") def setInitialCenters(centers: Array[Vector], weights: Array[Double]): this.type = { model = new StreamingKMeansModel(centers, weights) this @@ -208,6 +227,7 @@ class StreamingKMeans( * @param weight Weight for each center * @param seed Random seed */ + @Since("1.2.0") def setRandomCenters(dim: Int, weight: Double, seed: Long = Utils.random.nextLong): this.type = { val random = new XORShiftRandom(seed) val centers = Array.fill(k)(Vectors.dense(Array.fill(dim)(random.nextGaussian()))) @@ -216,7 +236,10 @@ class StreamingKMeans( this } - /** Return the latest model. */ + /** + * Return the latest model. + */ + @Since("1.2.0") def latestModel(): StreamingKMeansModel = { model } @@ -229,6 +252,7 @@ class StreamingKMeans( * * @param data DStream containing vector data */ + @Since("1.2.0") def trainOn(data: DStream[Vector]) { assertInitialized() data.foreachRDD { (rdd, time) => @@ -236,7 +260,10 @@ class StreamingKMeans( } } - /** Java-friendly version of `trainOn`. */ + /** + * Java-friendly version of `trainOn`. 
+ */ + @Since("1.4.0") def trainOn(data: JavaDStream[Vector]): Unit = trainOn(data.dstream) /** @@ -245,12 +272,16 @@ class StreamingKMeans( * @param data DStream containing vector data * @return DStream containing predictions */ + @Since("1.2.0") def predictOn(data: DStream[Vector]): DStream[Int] = { assertInitialized() data.map(model.predict) } - /** Java-friendly version of `predictOn`. */ + /** + * Java-friendly version of `predictOn`. + */ + @Since("1.4.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Integer] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Integer]]) } @@ -262,12 +293,16 @@ class StreamingKMeans( * @tparam K key type * @return DStream containing the input keys and the predictions as values */ + @Since("1.2.0") def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Int)] = { assertInitialized() data.mapValues(model.predict) } - /** Java-friendly version of `predictOnValues`. */ + /** + * Java-friendly version of `predictOnValues`. + */ + @Since("1.4.0") def predictOnValues[K]( data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Integer] = { implicit val tag = fakeClassTag[K] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index c1d1a224817e..508fe532b130 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.evaluation.binary._ @@ -42,16 +42,18 @@ import org.apache.spark.sql.DataFrame * be smaller as a result, meaning there may be an extra sample at * partition boundaries. */ +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( - val scoreAndLabels: RDD[(Double, Double)], - val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( + @Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], + @Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") /** * Defaults `numBins` to 0. */ + @Since("1.0.0") def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) /** @@ -61,12 +63,18 @@ class BinaryClassificationMetrics( private[mllib] def this(scoreAndLabels: DataFrame) = this(scoreAndLabels.map(r => (r.getDouble(0), r.getDouble(1)))) - /** Unpersist intermediate RDDs used in the computation. */ + /** + * Unpersist intermediate RDDs used in the computation. + */ + @Since("1.0.0") def unpersist() { cumulativeCounts.unpersist() } - /** Returns thresholds in descending order. */ + /** + * Returns thresholds in descending order. + */ + @Since("1.0.0") def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -75,6 +83,7 @@ class BinaryClassificationMetrics( * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. 
* @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ + @Since("1.0.0") def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) val sc = confusions.context @@ -86,6 +95,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. */ + @Since("1.0.0") def areaUnderROC(): Double = AreaUnderCurve.of(roc()) /** @@ -93,6 +103,7 @@ class BinaryClassificationMetrics( * NOT (precision, recall), with (0.0, 1.0) prepended to it. * @see http://en.wikipedia.org/wiki/Precision_and_recall */ + @Since("1.0.0") def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) val sc = confusions.context @@ -103,6 +114,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. */ + @Since("1.0.0") def areaUnderPR(): Double = AreaUnderCurve.of(pr()) /** @@ -111,15 +123,25 @@ class BinaryClassificationMetrics( * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score */ + @Since("1.0.0") def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) - /** Returns the (threshold, F-Measure) curve with beta = 1.0. */ + /** + * Returns the (threshold, F-Measure) curve with beta = 1.0. + */ + @Since("1.0.0") def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) - /** Returns the (threshold, precision) curve. */ + /** + * Returns the (threshold, precision) curve. + */ + @Since("1.0.0") def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) - /** Returns the (threshold, recall) curve. */ + /** + * Returns the (threshold, recall) curve. + */ + @Since("1.0.0") def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) private lazy val ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 4628dc569091..00e837661dfc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.evaluation import scala.collection.Map import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame @@ -31,8 +31,9 @@ import org.apache.spark.sql.DataFrame * * @param predictionAndLabels an RDD of (prediction, label) pairs. */ +@Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. @@ -65,6 +66,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * they are ordered by class label ascending, * as in "labels" */ + @Since("1.1.0") def confusionMatrix: Matrix = { val n = labels.size val values = Array.ofDim[Double](n * n) @@ -84,12 +86,14 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns true positive rate for a given label (category) * @param label the label. */ + @Since("1.1.0") def truePositiveRate(label: Double): Double = recall(label) /** * Returns false positive rate for a given label (category) * @param label the label. 
*/ + @Since("1.1.0") def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) fp.toDouble / (labelCount - labelCountByClass(label)) @@ -99,6 +103,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns precision for a given label (category) * @param label the label. */ + @Since("1.1.0") def precision(label: Double): Double = { val tp = tpByClass(label) val fp = fpByClass.getOrElse(label, 0) @@ -109,6 +114,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns recall for a given label (category) * @param label the label. */ + @Since("1.1.0") def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) /** @@ -116,6 +122,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * @param label the label. * @param beta the beta parameter. */ + @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) val r = recall(label) @@ -127,11 +134,13 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns f1-measure for a given label (category) * @param label the label. */ + @Since("1.1.0") def fMeasure(label: Double): Double = fMeasure(label, 1.0) /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -140,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -165,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -172,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged precision */ + @Since("1.1.0") lazy val weightedPrecision: Double = labelCountByClass.map { case (category, count) => precision(category) * count.toDouble / labelCount }.sum @@ -180,6 +195,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged f-measure * @param beta the beta parameter. 
*/ + @Since("1.1.0") def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount }.sum @@ -187,6 +203,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged f1-measure */ + @Since("1.1.0") lazy val weightedFMeasure: Double = labelCountByClass.map { case (category, count) => fMeasure(category, 1.0) * count.toDouble / labelCount }.sum @@ -194,5 +211,6 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns the sequence of labels in ascending order */ + @Since("1.1.0") lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala index bf6eb1d5bd2a..c100b3c9ec14 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.evaluation +import org.apache.spark.annotation.Since import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.sql.DataFrame @@ -26,7 +27,8 @@ import org.apache.spark.sql.DataFrame * @param predictionAndLabels an RDD of (predictions, labels) pairs, * both are non-null Arrays, each with unique elements. */ -class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { +@Since("1.2.0") +class MultilabelMetrics @Since("1.2.0") (predictionAndLabels: RDD[(Array[Double], Array[Double])]) { /** * An auxiliary constructor taking a DataFrame. @@ -44,6 +46,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns subset accuracy * (for equal sets of labels) */ + @Since("1.2.0") lazy val subsetAccuracy: Double = predictionAndLabels.filter { case (predictions, labels) => predictions.deep == labels.deep }.count().toDouble / numDocs @@ -51,6 +54,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns accuracy */ + @Since("1.2.0") lazy val accuracy: Double = predictionAndLabels.map { case (predictions, labels) => labels.intersect(predictions).size.toDouble / (labels.size + predictions.size - labels.intersect(predictions).size)}.sum / numDocs @@ -59,6 +63,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns Hamming-loss */ + @Since("1.2.0") lazy val hammingLoss: Double = predictionAndLabels.map { case (predictions, labels) => labels.size + predictions.size - 2 * labels.intersect(predictions).size }.sum / (numDocs * numLabels) @@ -66,6 +71,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns document-based precision averaged by the number of documents */ + @Since("1.2.0") lazy val precision: Double = predictionAndLabels.map { case (predictions, labels) => if (predictions.size > 0) { predictions.intersect(labels).size.toDouble / predictions.size @@ -77,6 +83,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns document-based recall averaged by the number of documents */ + @Since("1.2.0") lazy val recall: Double = predictionAndLabels.map { case (predictions, labels) => labels.intersect(predictions).size.toDouble / labels.size }.sum / numDocs @@ -84,6 +91,7 @@ class MultilabelMetrics(predictionAndLabels: 
RDD[(Array[Double], Array[Double])] /** * Returns document-based f1-measure averaged by the number of documents */ + @Since("1.2.0") lazy val f1Measure: Double = predictionAndLabels.map { case (predictions, labels) => 2.0 * predictions.intersect(labels).size / (predictions.size + labels.size) }.sum / numDocs @@ -104,6 +112,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns precision for a given label (category) * @param label the label. */ + @Since("1.2.0") def precision(label: Double): Double = { val tp = tpPerClass(label) val fp = fpPerClass.getOrElse(label, 0L) @@ -114,6 +123,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns recall for a given label (category) * @param label the label. */ + @Since("1.2.0") def recall(label: Double): Double = { val tp = tpPerClass(label) val fn = fnPerClass.getOrElse(label, 0L) @@ -124,6 +134,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns f1-measure for a given label (category) * @param label the label. */ + @Since("1.2.0") def f1Measure(label: Double): Double = { val p = precision(label) val r = recall(label) @@ -138,6 +149,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based precision * (equals to micro-averaged document-based precision) */ + @Since("1.2.0") lazy val microPrecision: Double = { val sumFp = fpPerClass.foldLeft(0L){ case(cum, (_, fp)) => cum + fp} sumTp.toDouble / (sumTp + sumFp) @@ -147,6 +159,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based recall * (equals to micro-averaged document-based recall) */ + @Since("1.2.0") lazy val microRecall: Double = { val sumFn = fnPerClass.foldLeft(0.0){ case(cum, (_, fn)) => cum + fn} sumTp.toDouble / (sumTp + sumFn) @@ -156,10 +169,12 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based f1-measure * (equals to micro-averaged document-based f1-measure) */ + @Since("1.2.0") lazy val microF1Measure: Double = 2.0 * sumTp / (2 * sumTp + sumFnClass + sumFpClass) /** * Returns the sequence of labels in ascending order */ + @Since("1.2.0") lazy val labels: Array[Double] = tpPerClass.keys.toArray.sorted } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index 5b5a2a1450f7..a7f43f0b110f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} import org.apache.spark.rdd.RDD @@ -35,6 +35,7 @@ import org.apache.spark.rdd.RDD * * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs. 
*/ +@Since("1.2.0") @Experimental class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]) extends Logging with Serializable { @@ -56,6 +57,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * @param k the position to compute the truncated precision, must be positive * @return the average precision at the first k ranking positions */ + @Since("1.2.0") def precisionAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -125,6 +127,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * @param k the position to compute the truncated ndcg, must be positive * @return the average ndcg at the first k ranking positions */ + @Since("1.2.0") def ndcgAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -163,6 +166,7 @@ object RankingMetrics { * Creates a [[RankingMetrics]] instance (for Java users). * @param predictionAndLabels a JavaRDD of (predicted ranking, ground truth set) pairs */ + @Since("1.4.0") def of[E, T <: jl.Iterable[E]](predictionAndLabels: JavaRDD[(T, T)]): RankingMetrics[E] = { implicit val tag = JavaSparkContext.fakeClassTag[E] val rdd = predictionAndLabels.rdd.map { case (predictions, labels) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 408847afa800..799ebb980ef0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.Logging import org.apache.spark.mllib.linalg.Vectors @@ -30,8 +30,10 @@ import org.apache.spark.sql.DataFrame * * @param predictionAndObservations an RDD of (prediction, observation) pairs. */ +@Since("1.2.0") @Experimental -class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging { +class RegressionMetrics @Since("1.2.0") ( + predictionAndObservations: RDD[(Double, Double)]) extends Logging { /** * An auxiliary constructor taking a DataFrame. @@ -67,6 +69,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] */ + @Since("1.2.0") def explainedVariance: Double = { SSreg / summary.count } @@ -75,6 +78,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. */ + @Since("1.2.0") def meanAbsoluteError: Double = { summary.normL1(1) / summary.count } @@ -83,6 +87,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. 
*/ + @Since("1.2.0") def meanSquaredError: Double = { SSerr / summary.count } @@ -91,6 +96,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the root mean squared error, which is defined as the square root of * the mean squared error. */ + @Since("1.2.0") def rootMeanSquaredError: Double = { math.sqrt(this.meanSquaredError) } @@ -99,6 +105,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns R^2^, the unadjusted coefficient of determination. * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] */ + @Since("1.2.0") def r2: Double = { 1 - SSerr / SStot } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 5f8c1dea237b..4743cfd1a2c3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature import scala.collection.mutable.ArrayBuilder -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics @@ -31,8 +31,10 @@ import org.apache.spark.rdd.RDD * * @param selectedFeatures list of indices to select (filter). Must be ordered asc */ +@Since("1.3.0") @Experimental -class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransformer { +class ChiSqSelectorModel @Since("1.3.0") ( + @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer { require(isSorted(selectedFeatures), "Array has to be sorted asc") @@ -52,6 +54,7 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.3.0") override def transform(vector: Vector): Vector = { compress(vector, selectedFeatures) } @@ -107,8 +110,10 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) */ +@Since("1.3.0") @Experimental -class ChiSqSelector (val numTopFeatures: Int) extends Serializable { +class ChiSqSelector @Since("1.3.0") ( + @Since("1.3.0") val numTopFeatures: Int) extends Serializable { /** * Returns a ChiSquared feature selector. @@ -117,6 +122,7 @@ class ChiSqSelector (val numTopFeatures: Int) extends Serializable { * Real-valued features will be treated as categorical for each distinct value. * Apply feature discretizer before using this function. 
*/ + @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val indices = Statistics.chiSqTest(data) .zipWithIndex.sortBy { case (res, _) => -res.statistic } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala index d67fe6c3ee4f..d0a6cf61687a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg._ /** @@ -27,8 +27,10 @@ import org.apache.spark.mllib.linalg._ * multiplier. * @param scalingVec The values used to scale the reference vector's individual components. */ +@Since("1.4.0") @Experimental -class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer { +class ElementwiseProduct @Since("1.4.0") ( + @Since("1.4.0") val scalingVec: Vector) extends VectorTransformer { /** * Does the Hadamard product transformation. @@ -36,6 +38,7 @@ class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer { * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.4.0") override def transform(vector: Vector): Vector = { require(vector.size == scalingVec.size, s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index c53475818395..e47d524b6162 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -22,7 +22,7 @@ import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD @@ -34,19 +34,25 @@ import org.apache.spark.util.Utils * * @param numFeatures number of features (default: 2^20^) */ +@Since("1.1.0") @Experimental class HashingTF(val numFeatures: Int) extends Serializable { + /** Creates a HashingTF instance with the default number of features, 2^20^. + */ + @Since("1.1.0") def this() = this(1 << 20) /** * Returns the index of the input term. */ + @Since("1.1.0") def indexOf(term: Any): Int = Utils.nonNegativeMod(term.##, numFeatures) /** * Transforms the input document into a sparse term frequency vector. */ + @Since("1.1.0") def transform(document: Iterable[_]): Vector = { val termFrequencies = mutable.HashMap.empty[Int, Double] document.foreach { term => @@ -59,6 +65,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document into a sparse term frequency vector (Java version). */ + @Since("1.1.0") def transform(document: JavaIterable[_]): Vector = { transform(document.asScala) } @@ -66,6 +73,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document to term frequency vectors. 
*/ + @Since("1.1.0") def transform[D <: Iterable[_]](dataset: RDD[D]): RDD[Vector] = { dataset.map(this.transform) } @@ -73,6 +81,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document to term frequency vectors (Java version). */ + @Since("1.1.0") def transform[D <: JavaIterable[_]](dataset: JavaRDD[D]): JavaRDD[Vector] = { dataset.rdd.map(this.transform).toJavaRDD() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala index 3fab7ea79bef..68078ccfa3d6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.rdd.RDD @@ -37,9 +37,11 @@ import org.apache.spark.rdd.RDD * @param minDocFreq minimum of documents in which a term * should appear for filtering */ +@Since("1.1.0") @Experimental -class IDF(val minDocFreq: Int) { +class IDF @Since("1.2.0") (@Since("1.2.0") val minDocFreq: Int) { + @Since("1.1.0") def this() = this(0) // TODO: Allow different IDF formulations. @@ -48,6 +50,7 @@ class IDF(val minDocFreq: Int) { * Computes the inverse document frequency. * @param dataset an RDD of term frequency vectors */ + @Since("1.1.0") def fit(dataset: RDD[Vector]): IDFModel = { val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator( minDocFreq = minDocFreq))( @@ -61,6 +64,7 @@ class IDF(val minDocFreq: Int) { * Computes the inverse document frequency. * @param dataset a JavaRDD of term frequency vectors */ + @Since("1.1.0") def fit(dataset: JavaRDD[Vector]): IDFModel = { fit(dataset.rdd) } @@ -159,7 +163,8 @@ private object IDF { * Represents an IDF model that can transform term frequency vectors. */ @Experimental -class IDFModel private[spark] (val idf: Vector) extends Serializable { +@Since("1.1.0") +class IDFModel private[spark] (@Since("1.1.0") val idf: Vector) extends Serializable { /** * Transforms term frequency (TF) vectors to TF-IDF vectors. 
@@ -171,6 +176,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param dataset an RDD of term frequency vectors * @return an RDD of TF-IDF vectors */ + @Since("1.1.0") def transform(dataset: RDD[Vector]): RDD[Vector] = { val bcIdf = dataset.context.broadcast(idf) dataset.mapPartitions(iter => iter.map(v => IDFModel.transform(bcIdf.value, v))) @@ -182,6 +188,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param v a term frequency vector * @return a TF-IDF vector */ + @Since("1.3.0") def transform(v: Vector): Vector = IDFModel.transform(idf, v) /** @@ -189,6 +196,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param dataset a JavaRDD of term frequency vectors * @return a JavaRDD of TF-IDF vectors */ + @Since("1.1.0") def transform(dataset: JavaRDD[Vector]): JavaRDD[Vector] = { transform(dataset.rdd).toJavaRDD() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala index 32848e039eb8..8d5a22520d6b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} /** @@ -31,9 +31,11 @@ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors * * @param p Normalization in L^p^ space, p = 2 by default. */ +@Since("1.1.0") @Experimental -class Normalizer(p: Double) extends VectorTransformer { +class Normalizer @Since("1.1.0") (p: Double) extends VectorTransformer { + @Since("1.1.0") def this() = this(2) require(p >= 1.0) @@ -44,6 +46,7 @@ class Normalizer(p: Double) extends VectorTransformer { * @param vector vector to be normalized. * @return normalized vector. If the norm of the input is zero, it will return the input vector. 
*/ + @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index 2a66263d8b7d..ecb3c1e6c1c8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.feature +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.distributed.RowMatrix @@ -27,7 +28,8 @@ import org.apache.spark.rdd.RDD * * @param k number of principal components */ -class PCA(val k: Int) { +@Since("1.4.0") +class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k") /** @@ -35,6 +37,7 @@ class PCA(val k: Int) { * * @param sources source vectors */ + @Since("1.4.0") def fit(sources: RDD[Vector]): PCAModel = { require(k <= sources.first().size, s"source vector size is ${sources.first().size} must be greater than k=$k") @@ -58,7 +61,10 @@ class PCA(val k: Int) { new PCAModel(k, pc) } - /** Java-friendly version of [[fit()]] */ + /** + * Java-friendly version of [[fit()]] + */ + @Since("1.4.0") def fit(sources: JavaRDD[Vector]): PCAModel = fit(sources.rdd) } @@ -68,7 +74,10 @@ class PCA(val k: Int) { * @param k number of principal components. * @param pc a principal components Matrix. Each column is one principal component. */ -class PCAModel private[spark] (val k: Int, val pc: DenseMatrix) extends VectorTransformer { +@Since("1.4.0") +class PCAModel private[spark] ( + @Since("1.4.0") val k: Int, + @Since("1.4.0") val pc: DenseMatrix) extends VectorTransformer { /** * Transform a vector by computed Principal Components. * @@ -76,6 +85,7 @@ class PCAModel private[spark] (val k: Int, val pc: DenseMatrix) extends VectorTr * Vector must be the same length as the source vectors given to [[PCA.fit()]]. * @return transformed vector. Vector will be of length k. */ + @Since("1.4.0") override def transform(vector: Vector): Vector = { vector match { case dv: DenseVector => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index c73b8f258060..f018b453bae7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.feature import org.apache.spark.Logging -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.rdd.RDD @@ -32,9 +32,11 @@ import org.apache.spark.rdd.RDD * dense output, so this does not work on sparse input and will raise an exception. * @param withStd True by default. Scales the data to unit standard deviation. 
*/ +@Since("1.1.0") @Experimental -class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { +class StandardScaler @Since("1.1.0") (withMean: Boolean, withStd: Boolean) extends Logging { + @Since("1.1.0") def this() = this(false, true) if (!(withMean || withStd)) { @@ -47,6 +49,7 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { * @param data The data used to compute the mean and variance to build the transformation model. * @return a StandardScalerModel */ + @Since("1.1.0") def fit(data: RDD[Vector]): StandardScalerModel = { // TODO: skip computation if both withMean and withStd are false val summary = data.treeAggregate(new MultivariateOnlineSummarizer)( @@ -69,13 +72,17 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { * @param withStd whether to scale the data to have unit standard deviation * @param withMean whether to center the data before scaling */ +@Since("1.1.0") @Experimental -class StandardScalerModel ( - val std: Vector, - val mean: Vector, - var withStd: Boolean, - var withMean: Boolean) extends VectorTransformer { +class StandardScalerModel @Since("1.3.0") ( + @Since("1.3.0") val std: Vector, + @Since("1.1.0") val mean: Vector, + @Since("1.3.0") var withStd: Boolean, + @Since("1.3.0") var withMean: Boolean) extends VectorTransformer { + /** Creates a model from the given std and mean; withStd and withMean are inferred from whether std and mean are non-null. + */ + @Since("1.3.0") def this(std: Vector, mean: Vector) { this(std, mean, withStd = std != null, withMean = mean != null) require(this.withStd || this.withMean, @@ -86,8 +93,10 @@ class StandardScalerModel ( } } + @Since("1.3.0") def this(std: Vector) = this(std, null) + @Since("1.3.0") @DeveloperApi def setWithMean(withMean: Boolean): this.type = { require(!(withMean && this.mean == null), "cannot set withMean to true while mean is null") @@ -95,6 +104,7 @@ class StandardScalerModel ( this } + @Since("1.3.0") @DeveloperApi def setWithStd(withStd: Boolean): this.type = { require(!(withStd && this.std == null), @@ -115,6 +125,7 @@ class StandardScalerModel ( * @return Standardized vector. If the std of a column is zero, it will return default `0.0` * for the column with zero std. */ + @Since("1.1.0") override def transform(vector: Vector): Vector = { require(mean.size == vector.size) if (withMean) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala index 7358c1c84f79..5778fd1d0925 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD * :: DeveloperApi :: * Trait for transformation of a vector */ +@Since("1.1.0") @DeveloperApi trait VectorTransformer extends Serializable { @@ -35,6 +36,7 @@ trait VectorTransformer extends Serializable { * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.1.0") def transform(vector: Vector): Vector /** @@ -43,6 +45,7 @@ trait VectorTransformer extends Serializable { * @param data RDD[Vector] to be transformed. * @return transformed RDD[Vector]. 
*/ + @Since("1.1.0") def transform(data: RDD[Vector]): RDD[Vector] = { // Later in #1498 , all RDD objects are sent via broadcasting instead of akka. // So it should be no longer necessary to explicitly broadcast `this` object. @@ -55,6 +58,7 @@ trait VectorTransformer extends Serializable { * @param data JavaRDD[Vector] to be transformed. * @return transformed JavaRDD[Vector]. */ + @Since("1.1.0") def transform(data: JavaRDD[Vector]): JavaRDD[Vector] = { transform(data.rdd) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index cbbd2b0c8d06..36b124c5d296 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -32,7 +32,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.Logging import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseMatrix, BLAS, DenseVector} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -70,6 +70,7 @@ private case class VocabWord( * and * Distributed Representations of Words and Phrases and their Compositionality. */ +@Since("1.1.0") @Experimental class Word2Vec extends Serializable with Logging { @@ -83,6 +84,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets vector size (default: 100). */ + @Since("1.1.0") def setVectorSize(vectorSize: Int): this.type = { this.vectorSize = vectorSize this @@ -91,6 +93,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets initial learning rate (default: 0.025). */ + @Since("1.1.0") def setLearningRate(learningRate: Double): this.type = { this.learningRate = learningRate this @@ -99,6 +102,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets number of partitions (default: 1). Use a small number for accuracy. */ + @Since("1.1.0") def setNumPartitions(numPartitions: Int): this.type = { require(numPartitions > 0, s"numPartitions must be greater than 0 but got $numPartitions") this.numPartitions = numPartitions @@ -109,6 +113,7 @@ class Word2Vec extends Serializable with Logging { * Sets number of iterations (default: 1), which should be smaller than or equal to number of * partitions. */ + @Since("1.1.0") def setNumIterations(numIterations: Int): this.type = { this.numIterations = numIterations this @@ -117,6 +122,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets random seed (default: a random long integer). */ + @Since("1.1.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -126,6 +132,7 @@ class Word2Vec extends Serializable with Logging { * Sets minCount, the minimum number of times a token must appear to be included in the word2vec * model's vocabulary (default: 5). 
*/ + @Since("1.3.0") def setMinCount(minCount: Int): this.type = { this.minCount = minCount this @@ -263,6 +270,7 @@ class Word2Vec extends Serializable with Logging { * @param dataset an RDD of words * @return a Word2VecModel */ + @Since("1.1.0") def fit[S <: Iterable[String]](dataset: RDD[S]): Word2VecModel = { val words = dataset.flatMap(x => x) @@ -412,6 +420,7 @@ class Word2Vec extends Serializable with Logging { * @param dataset a JavaRDD of words * @return a Word2VecModel */ + @Since("1.1.0") def fit[S <: JavaIterable[String]](dataset: JavaRDD[S]): Word2VecModel = { fit(dataset.rdd.map(_.asScala)) } @@ -427,6 +436,7 @@ class Word2Vec extends Serializable with Logging { * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental +@Since("1.1.0") class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { @@ -454,6 +464,7 @@ class Word2VecModel private[mllib] ( wordVecNorms } + @Since("1.5.0") def this(model: Map[String, Array[Float]]) = { this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } @@ -469,6 +480,7 @@ class Word2VecModel private[mllib] ( override protected def formatVersion = "1.0" + @Since("1.4.0") def save(sc: SparkContext, path: String): Unit = { Word2VecModel.SaveLoadV1_0.save(sc, path, getVectors) } @@ -478,6 +490,7 @@ class Word2VecModel private[mllib] ( * @param word a word * @return vector representation of word */ + @Since("1.1.0") def transform(word: String): Vector = { wordIndex.get(word) match { case Some(ind) => @@ -494,6 +507,7 @@ class Word2VecModel private[mllib] ( * @param num number of synonyms to find * @return array of (word, cosineSimilarity) */ + @Since("1.1.0") def findSynonyms(word: String, num: Int): Array[(String, Double)] = { val vector = transform(word) findSynonyms(vector, num) @@ -505,6 +519,7 @@ class Word2VecModel private[mllib] ( * @param num number of synonyms to find * @return array of (word, cosineSimilarity) */ + @Since("1.1.0") def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") // TODO: optimize top-k @@ -534,6 +549,7 @@ class Word2VecModel private[mllib] ( /** * Returns a map of words to their vector representations. 
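// Illustrative usage sketch (not part of this patch): fitting the Word2Vec model annotated
// above and querying synonyms. Assumes an existing SparkContext `sc`, a hypothetical
// whitespace-tokenized corpus at "data/text8", and an arbitrary query word "day".
import org.apache.spark.mllib.feature.Word2Vec

val corpus = sc.textFile("data/text8").map(_.split(" ").toSeq)
val model = new Word2Vec().setVectorSize(100).setMinCount(5).fit(corpus)
model.findSynonyms("day", 5).foreach { case (word, similarity) =>
  println(s"$word $similarity")
}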
*/ + @Since("1.2.0") def getVectors: Map[String, Array[Float]] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) @@ -541,6 +557,7 @@ class Word2VecModel private[mllib] ( } } +@Since("1.4.0") @Experimental object Word2VecModel extends Loader[Word2VecModel] { @@ -600,6 +617,7 @@ object Word2VecModel extends Loader[Word2VecModel] { } } + @Since("1.4.0") override def load(sc: SparkContext, path: String): Word2VecModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 72d0ea0c12e1..95c688c86a7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -16,10 +16,11 @@ */ package org.apache.spark.mllib.fpm +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.fpm.AssociationRules.Rule @@ -32,24 +33,22 @@ import org.apache.spark.rdd.RDD * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates * association rules which have a single item as the consequent. * - * @since 1.5.0 */ +@Since("1.5.0") @Experimental class AssociationRules private[fpm] ( private var minConfidence: Double) extends Logging with Serializable { /** * Constructs a default instance with default parameters {minConfidence = 0.8}. - * - * @since 1.5.0 */ + @Since("1.5.0") def this() = this(0.8) /** * Sets the minimal confidence (default: `0.8`). - * - * @since 1.5.0 */ + @Since("1.5.0") def setMinConfidence(minConfidence: Double): this.type = { require(minConfidence >= 0.0 && minConfidence <= 1.0) this.minConfidence = minConfidence @@ -61,8 +60,8 @@ class AssociationRules private[fpm] ( * @param freqItemsets frequent itemset model obtained from [[FPGrowth]] * @return a [[Set[Rule[Item]]] containing the assocation rules. * - * @since 1.5.0 */ + @Since("1.5.0") def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]]): RDD[Rule[Item]] = { // For candidate rule X => Y, generate (X, (Y, freq(X union Y))) val candidates = freqItemsets.flatMap { itemset => @@ -83,31 +82,41 @@ class AssociationRules private[fpm] ( }.filter(_.confidence >= minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since("1.5.0") def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since("1.5.0") object AssociationRules { /** * :: Experimental :: * * An association rule between sets of items. - * @param antecedent hypotheses of the rule - * @param consequent conclusion of the rule + * @param antecedent hypotheses of the rule. Java users should call [[Rule#javaAntecedent]] + * instead. + * @param consequent conclusion of the rule. Java users should call [[Rule#javaConsequent]] + * instead. 
* @tparam Item item type * - * @since 1.5.0 */ + @Since("1.5.0") @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since("1.5.0") val antecedent: Array[Item], + @Since("1.5.0") val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { + /** + * Returns the confidence of the rule. + * + */ + @Since("1.5.0") def confidence: Double = freqUnion.toDouble / freqAntecedent require(antecedent.toSet.intersect(consequent.toSet).isEmpty, { @@ -115,5 +124,23 @@ object AssociationRules { s"A valid association rule must have disjoint antecedent and " + s"consequent but ${sharedItems} is present in both." }) + + /** + * Returns antecedent in a Java List. + * + */ + @Since("1.5.0") + def javaAntecedent: java.util.List[Item] = { + antecedent.toList.asJava + } + + /** + * Returns consequent in a Java List. + * + */ + @Since("1.5.0") + def javaConsequent: java.util.List[Item] = { + consequent.toList.asJava + } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e2370a52f493..aea5c4f8a8a7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.fpm.FPGrowth._ @@ -39,15 +39,16 @@ import org.apache.spark.storage.StorageLevel * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]] * @tparam Item item type * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since("1.3.0") ( + @Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced - * @since 1.5.0 */ + @Since("1.5.0") def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = { val associationRules = new AssociationRules(confidence) associationRules.run(freqItemsets) @@ -71,8 +72,8 @@ class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) ex * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning * (Wikipedia)]] * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class FPGrowth private ( private var minSupport: Double, @@ -82,15 +83,15 @@ class FPGrowth private ( * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same * as the input data}. * - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(0.3, -1) /** * Sets the minimal support level (default: `0.3`). * - * @since 1.3.0 */ + @Since("1.3.0") def setMinSupport(minSupport: Double): this.type = { this.minSupport = minSupport this @@ -99,8 +100,8 @@ class FPGrowth private ( /** * Sets the number of partitions used by parallel FP-growth (default: same as input data). 
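// Illustrative usage sketch (not part of this patch): mining frequent itemsets with FPGrowth
// and deriving association rules through the generateAssociationRules and Rule APIs annotated
// above. Assumes an existing SparkContext `sc` and a hypothetical file of space-separated
// transactions.
import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.textFile("data/transactions.txt").map(_.trim.split(' '))
val model = new FPGrowth().setMinSupport(0.2).setNumPartitions(10).run(transactions)
model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}
model.generateAssociationRules(0.8).collect().foreach { rule =>
  println(rule.antecedent.mkString(",") + " => " + rule.consequent.mkString(",") +
    ", " + rule.confidence)
}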
* - * @since 1.3.0 */ + @Since("1.3.0") def setNumPartitions(numPartitions: Int): this.type = { this.numPartitions = numPartitions this @@ -111,8 +112,8 @@ class FPGrowth private ( * @param data input data set, each element contains a transaction * @return an [[FPGrowthModel]] * - * @since 1.3.0 */ + @Since("1.3.0") def run[Item: ClassTag](data: RDD[Array[Item]]): FPGrowthModel[Item] = { if (data.getStorageLevel == StorageLevel.NONE) { logWarning("Input data is not cached.") @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. */ + @Since("1.3.0") def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -213,8 +216,8 @@ class FPGrowth private ( /** * :: Experimental :: * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental object FPGrowth { @@ -224,15 +227,17 @@ object FPGrowth { * @param freq frequency * @tparam Item item type * - * @since 1.3.0 */ - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + @Since("1.3.0") + class FreqItemset[Item] @Since("1.3.0") ( + @Since("1.3.0") val items: Array[Item], + @Since("1.3.0") val freq: Long) extends Serializable { /** * Returns items in a Java List. * - * @since 1.3.0 */ + @Since("1.3.0") def javaItems: java.util.List[Item] = { items.toList.asJava } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala index ccebf951c850..3ea10779a183 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala @@ -22,85 +22,89 @@ import scala.collection.mutable import org.apache.spark.Logging /** - * Calculate all patterns of a projected database in local. + * Calculate all patterns of a projected database in local mode. + * + * @param minCount minimal count for a frequent pattern + * @param maxPatternLength max pattern length for a frequent pattern */ -private[fpm] object LocalPrefixSpan extends Logging with Serializable { - import PrefixSpan._ +private[fpm] class LocalPrefixSpan( + val minCount: Long, + val maxPatternLength: Int) extends Logging with Serializable { + import PrefixSpan.Postfix + import LocalPrefixSpan.ReversedPrefix + /** - * Calculate all patterns of a projected database. - * @param minCount minimum count - * @param maxPatternLength maximum pattern length - * @param prefixes prefixes in reversed order - * @param database the projected database - * @return a set of sequential pattern pairs, - * the key of pair is sequential pattern (a list of items in reversed order), - * the value of pair is the pattern's count. + * Generates frequent patterns on the input array of postfixes. 
+ * @param postfixes an array of postfixes + * @return an iterator of (frequent pattern, count) */ - def run( - minCount: Long, - maxPatternLength: Int, - prefixes: List[Set[Int]], - database: Iterable[List[Set[Int]]]): Iterator[(List[Set[Int]], Long)] = { - if (prefixes.length == maxPatternLength || database.isEmpty) { - return Iterator.empty - } - val freqItemSetsAndCounts = getFreqItemAndCounts(minCount, database) - val freqItems = freqItemSetsAndCounts.keys.flatten.toSet - val filteredDatabase = database.map { suffix => - suffix - .map(item => freqItems.intersect(item)) - .filter(_.nonEmpty) - } - freqItemSetsAndCounts.iterator.flatMap { case (item, count) => - val newPrefixes = item :: prefixes - val newProjected = project(filteredDatabase, item) - Iterator.single((newPrefixes, count)) ++ - run(minCount, maxPatternLength, newPrefixes, newProjected) + def run(postfixes: Array[Postfix]): Iterator[(Array[Int], Long)] = { + genFreqPatterns(ReversedPrefix.empty, postfixes).map { case (prefix, count) => + (prefix.toSequence, count) } } /** - * Calculate suffix sequence immediately after the first occurrence of an item. - * @param item itemset to get suffix after - * @param sequence sequence to extract suffix from - * @return suffix sequence + * Recursively generates frequent patterns. + * @param prefix current prefix + * @param postfixes projected postfixes w.r.t. the prefix + * @return an iterator of (prefix, count) */ - def getSuffix(item: Set[Int], sequence: List[Set[Int]]): List[Set[Int]] = { - val itemsetSeq = sequence - val index = itemsetSeq.indexWhere(item.subsetOf(_)) - if (index == -1) { - List() - } else { - itemsetSeq.drop(index + 1) + private def genFreqPatterns( + prefix: ReversedPrefix, + postfixes: Array[Postfix]): Iterator[(ReversedPrefix, Long)] = { + if (maxPatternLength == prefix.length || postfixes.length < minCount) { + return Iterator.empty + } + // find frequent items + val counts = mutable.Map.empty[Int, Long].withDefaultValue(0) + postfixes.foreach { postfix => + postfix.genPrefixItems.foreach { case (x, _) => + counts(x) += 1L + } + } + val freqItems = counts.toSeq.filter { case (_, count) => + count >= minCount + }.sorted + // project and recursively call genFreqPatterns + freqItems.toIterator.flatMap { case (item, count) => + val newPrefix = prefix :+ item + Iterator.single((newPrefix, count)) ++ { + val projected = postfixes.map(_.project(item)).filter(_.nonEmpty) + genFreqPatterns(newPrefix, projected) + } } } +} - def project( - database: Iterable[List[Set[Int]]], - prefix: Set[Int]): Iterable[List[Set[Int]]] = { - database - .map(getSuffix(prefix, _)) - .filter(_.nonEmpty) - } +private object LocalPrefixSpan { /** - * Generates frequent items by filtering the input data using minimal count level. - * @param minCount the minimum count for an item to be frequent - * @param database database of sequences - * @return freq item to count map + * Represents a prefix stored as a list in reversed order. 
+ * @param items items in the prefix in reversed order + * @param length length of the prefix, not counting delimiters */ - private def getFreqItemAndCounts( - minCount: Long, - database: Iterable[List[Set[Int]]]): Map[Set[Int], Long] = { - // TODO: use PrimitiveKeyOpenHashMap - val counts = mutable.Map[Set[Int], Long]().withDefaultValue(0L) - database.foreach { sequence => - sequence.flatMap(nonemptySubsets(_)).distinct.foreach { item => - counts(item) += 1L + class ReversedPrefix private (val items: List[Int], val length: Int) extends Serializable { + /** + * Expands the prefix by one item. + */ + def :+(item: Int): ReversedPrefix = { + require(item != 0) + if (item < 0) { + new ReversedPrefix(-item :: items, length + 1) + } else { + new ReversedPrefix(item :: 0 :: items, length + 1) } } - counts - .filter { case (_, count) => count >= minCount } - .toMap + + /** + * Converts this prefix to a sequence. + */ + def toSequence: Array[Int] = (0 :: items).toArray.reverse + } + + object ReversedPrefix { + /** An empty prefix. */ + val empty: ReversedPrefix = new ReversedPrefix(List.empty, 0) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index 9eaf733fada2..97916daa2e9a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -18,64 +18,67 @@ package org.apache.spark.mllib.fpm import java.{lang => jl, util => ju} +import java.util.concurrent.atomic.AtomicInteger +import scala.collection.mutable import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuilder import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel /** - * * :: Experimental :: * - * A parallel PrefixSpan algorithm to mine sequential pattern. - * The PrefixSpan algorithm is described in - * [[http://doi.org/10.1109/ICDE.2001.914830]]. + * A parallel PrefixSpan algorithm to mine frequent sequential patterns. + * The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns + * Efficiently by Prefix-Projected Pattern Growth ([[http://doi.org/10.1109/ICDE.2001.914830]]). * * @param minSupport the minimal support level of the sequential pattern, any pattern appears * more than (minSupport * size-of-the-dataset) times will be output * @param maxPatternLength the maximal length of the sequential pattern, any pattern appears - * less than maxPatternLength will be output + * less than maxPatternLength will be output + * @param maxLocalProjDBSize The maximum number of items (including delimiters used in the internal + * storage format) allowed in a projected database before local + * processing. If a projected database exceeds this size, another + * iteration of distributed prefix growth is run. 
* * @see [[https://en.wikipedia.org/wiki/Sequential_Pattern_Mining Sequential Pattern Mining * (Wikipedia)]] */ @Experimental +@Since("1.5.0") class PrefixSpan private ( private var minSupport: Double, - private var maxPatternLength: Int) extends Logging with Serializable { + private var maxPatternLength: Int, + private var maxLocalProjDBSize: Long) extends Logging with Serializable { import PrefixSpan._ - /** - * The maximum number of items allowed in a projected database before local processing. If a - * projected database exceeds this size, another iteration of distributed PrefixSpan is run. - */ - // TODO: make configurable with a better default value - private val maxLocalProjDBSize: Long = 32000000L - /** * Constructs a default instance with default parameters - * {minSupport: `0.1`, maxPatternLength: `10`}. + * {minSupport: `0.1`, maxPatternLength: `10`, maxLocalProjDBSize: `32000000L`}. */ - def this() = this(0.1, 10) + @Since("1.5.0") + def this() = this(0.1, 10, 32000000L) /** * Get the minimal support (i.e. the frequency of occurrence before a pattern is considered * frequent). */ - def getMinSupport: Double = this.minSupport + @Since("1.5.0") + def getMinSupport: Double = minSupport /** * Sets the minimal support level (default: `0.1`). */ + @Since("1.5.0") def setMinSupport(minSupport: Double): this.type = { - require(minSupport >= 0 && minSupport <= 1, "The minimum support value must be in [0, 1].") + require(minSupport >= 0 && minSupport <= 1, + s"The minimum support value must be in [0, 1], but got $minSupport.") this.minSupport = minSupport this } @@ -83,45 +86,120 @@ class PrefixSpan private ( /** * Gets the maximal pattern length (i.e. the length of the longest sequential pattern to consider. */ - def getMaxPatternLength: Double = this.maxPatternLength + @Since("1.5.0") + def getMaxPatternLength: Int = maxPatternLength /** * Sets maximal pattern length (default: `10`). */ + @Since("1.5.0") def setMaxPatternLength(maxPatternLength: Int): this.type = { // TODO: support unbounded pattern length when maxPatternLength = 0 - require(maxPatternLength >= 1, "The maximum pattern length value must be greater than 0.") + require(maxPatternLength >= 1, + s"The maximum pattern length value must be greater than 0, but got $maxPatternLength.") this.maxPatternLength = maxPatternLength this } /** - * Find the complete set of sequential patterns in the input sequences of itemsets. - * @param data ordered sequences of itemsets. - * @return a [[PrefixSpanModel]] that contains the frequent sequences + * Gets the maximum number of items allowed in a projected database before local processing. + */ + @Since("1.5.0") + def getMaxLocalProjDBSize: Long = maxLocalProjDBSize + + /** + * Sets the maximum number of items (including delimiters used in the internal storage format) + * allowed in a projected database before local processing (default: `32000000L`). + */ + @Since("1.5.0") + def setMaxLocalProjDBSize(maxLocalProjDBSize: Long): this.type = { + require(maxLocalProjDBSize >= 0L, + s"The maximum local projected database size must be nonnegative, but got $maxLocalProjDBSize") + this.maxLocalProjDBSize = maxLocalProjDBSize + this + } + + /** + * Finds the complete set of frequent sequential patterns in the input sequences of itemsets. + * @param data sequences of itemsets. 
+ * @return a [[PrefixSpanModel]] that contains the frequent patterns */ + @Since("1.5.0") def run[Item: ClassTag](data: RDD[Array[Array[Item]]]): PrefixSpanModel[Item] = { - val itemToInt = data.aggregate(Set[Item]())( - seqOp = { (uniqItems, item) => uniqItems ++ item.flatten.toSet }, - combOp = { _ ++ _ } - ).zipWithIndex.toMap - val intToItem = Map() ++ (itemToInt.map { case (k, v) => (v, k) }) - - val dataInternalRepr = data.map { seq => - seq.map(itemset => itemset.map(itemToInt)).reduce((a, b) => a ++ (DELIMITER +: b)) + if (data.getStorageLevel == StorageLevel.NONE) { + logWarning("Input data is not cached.") } - val results = run(dataInternalRepr) - def toPublicRepr(pattern: Iterable[Int]): List[Array[Item]] = { - pattern.span(_ != DELIMITER) match { - case (x, xs) if xs.size > 1 => x.map(intToItem).toArray :: toPublicRepr(xs.tail) - case (x, xs) => List(x.map(intToItem).toArray) + val totalCount = data.count() + logInfo(s"number of sequences: $totalCount") + val minCount = math.ceil(minSupport * totalCount).toLong + logInfo(s"minimum count for a frequent pattern: $minCount") + + // Find frequent items. + val freqItemAndCounts = data.flatMap { itemsets => + val uniqItems = mutable.Set.empty[Item] + itemsets.foreach { _.foreach { item => + uniqItems += item + }} + uniqItems.toIterator.map((_, 1L)) + }.reduceByKey(_ + _) + .filter { case (_, count) => + count >= minCount + }.collect() + val freqItems = freqItemAndCounts.sortBy(-_._2).map(_._1) + logInfo(s"number of frequent items: ${freqItems.length}") + + // Keep only frequent items from input sequences and convert them to internal storage. + val itemToInt = freqItems.zipWithIndex.toMap + val dataInternalRepr = data.flatMap { itemsets => + val allItems = mutable.ArrayBuilder.make[Int] + var containsFreqItems = false + allItems += 0 + itemsets.foreach { itemsets => + val items = mutable.ArrayBuilder.make[Int] + itemsets.foreach { item => + if (itemToInt.contains(item)) { + items += itemToInt(item) + 1 // using 1-indexing in internal format + } + } + val result = items.result() + if (result.nonEmpty) { + containsFreqItems = true + allItems ++= result.sorted + } + allItems += 0 + } + if (containsFreqItems) { + Iterator.single(allItems.result()) + } else { + Iterator.empty + } + }.persist(StorageLevel.MEMORY_AND_DISK) + + val results = genFreqPatterns(dataInternalRepr, minCount, maxPatternLength, maxLocalProjDBSize) + + def toPublicRepr(pattern: Array[Int]): Array[Array[Item]] = { + val sequenceBuilder = mutable.ArrayBuilder.make[Array[Item]] + val itemsetBuilder = mutable.ArrayBuilder.make[Item] + val n = pattern.length + var i = 1 + while (i < n) { + val x = pattern(i) + if (x == 0) { + sequenceBuilder += itemsetBuilder.result() + itemsetBuilder.clear() + } else { + itemsetBuilder += freqItems(x - 1) // using 1-indexing in internal format + } + i += 1 } + sequenceBuilder.result() } + val freqSequences = results.map { case (seq: Array[Int], count: Long) => - new FreqSequence[Item](toPublicRepr(seq).toArray, count) + new FreqSequence(toPublicRepr(seq), count) } - new PrefixSpanModel[Item](freqSequences) + new PrefixSpanModel(freqSequences) } /** @@ -131,208 +209,335 @@ class PrefixSpan private ( * @tparam Item item type * @tparam Itemset itemset type, which is an Iterable of Items * @tparam Sequence sequence type, which is an Iterable of Itemsets - * @return a [[PrefixSpanModel]] that contains the frequent sequences + * @return a [[PrefixSpanModel]] that contains the frequent sequential patterns */ + @Since("1.5.0") def run[Item, 
Itemset <: jl.Iterable[Item], Sequence <: jl.Iterable[Itemset]]( data: JavaRDD[Sequence]): PrefixSpanModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.map(_.asScala.toArray).toArray)) } +} + +@Experimental +@Since("1.5.0") +object PrefixSpan extends Logging { + /** - * Find the complete set of sequential patterns in the input sequences. This method utilizes - * the internal representation of itemsets as Array[Int] where each itemset is represented by - * a contiguous sequence of non-negative integers and delimiters represented by [[DELIMITER]]. - * @param data ordered sequences of itemsets. Items are represented by non-negative integers. - * Each itemset has one or more items and is delimited by [[DELIMITER]]. - * @return a set of sequential pattern pairs, - * the key of pair is pattern (a list of elements), - * the value of pair is the pattern's count. + * Find the complete set of frequent sequential patterns in the input sequences. + * @param data ordered sequences of itemsets. We represent a sequence internally as Array[Int], + * where each itemset is represented by a contiguous sequence of distinct and ordered + * positive integers. We use 0 as the delimiter at itemset boundaries, including the + * first and the last position. + * @return an RDD of (frequent sequential pattern, count) pairs, + * @see [[Postfix]] */ - private[fpm] def run(data: RDD[Array[Int]]): RDD[(Array[Int], Long)] = { + private[fpm] def genFreqPatterns( + data: RDD[Array[Int]], + minCount: Long, + maxPatternLength: Int, + maxLocalProjDBSize: Long): RDD[(Array[Int], Long)] = { val sc = data.sparkContext if (data.getStorageLevel == StorageLevel.NONE) { logWarning("Input data is not cached.") } - // Use List[Set[Item]] for internal computation - val sequences = data.map { seq => splitSequence(seq.toList) } - - // Convert min support to a min number of transactions for this dataset - val minCount = if (minSupport == 0) 0L else math.ceil(sequences.count() * minSupport).toLong - - // (Frequent items -> number of occurrences, all items here satisfy the `minSupport` threshold - val freqItemCounts = sequences - .flatMap(seq => seq.flatMap(nonemptySubsets(_)).distinct.map(item => (item, 1L))) - .reduceByKey(_ + _) - .filter { case (item, count) => (count >= minCount) } - .collect() - .toMap - - // Pairs of (length 1 prefix, suffix consisting of frequent items) - val itemSuffixPairs = { - val freqItemSets = freqItemCounts.keys.toSet - val freqItems = freqItemSets.flatten - sequences.flatMap { seq => - val filteredSeq = seq.map(item => freqItems.intersect(item)).filter(_.nonEmpty) - freqItemSets.flatMap { item => - val candidateSuffix = LocalPrefixSpan.getSuffix(item, filteredSeq) - candidateSuffix match { - case suffix if !suffix.isEmpty => Some((List(item), suffix)) - case _ => None + val postfixes = data.map(items => new Postfix(items)) + + // Local frequent patterns (prefixes) and their counts. + val localFreqPatterns = mutable.ArrayBuffer.empty[(Array[Int], Long)] + // Prefixes whose projected databases are small. + val smallPrefixes = mutable.Map.empty[Int, Prefix] + val emptyPrefix = Prefix.empty + // Prefixes whose projected databases are large. + var largePrefixes = mutable.Map(emptyPrefix.id -> emptyPrefix) + while (largePrefixes.nonEmpty) { + val numLocalFreqPatterns = localFreqPatterns.length + logInfo(s"number of local frequent patterns: $numLocalFreqPatterns") + if (numLocalFreqPatterns > 1000000) { + logWarning( + s""" + | Collected $numLocalFreqPatterns local frequent patterns. 
You may want to consider: + | 1. increase minSupport, + | 2. decrease maxPatternLength, + | 3. increase maxLocalProjDBSize. + """.stripMargin) + } + logInfo(s"number of small prefixes: ${smallPrefixes.size}") + logInfo(s"number of large prefixes: ${largePrefixes.size}") + val largePrefixArray = largePrefixes.values.toArray + val freqPrefixes = postfixes.flatMap { postfix => + largePrefixArray.flatMap { prefix => + postfix.project(prefix).genPrefixItems.map { case (item, postfixSize) => + ((prefix.id, item), (1L, postfixSize)) + } + } + }.reduceByKey { case ((c0, s0), (c1, s1)) => + (c0 + c1, s0 + s1) + }.filter { case (_, (c, _)) => c >= minCount } + .collect() + val newLargePrefixes = mutable.Map.empty[Int, Prefix] + freqPrefixes.foreach { case ((id, item), (count, projDBSize)) => + val newPrefix = largePrefixes(id) :+ item + localFreqPatterns += ((newPrefix.items :+ 0, count)) + if (newPrefix.length < maxPatternLength) { + if (projDBSize > maxLocalProjDBSize) { + newLargePrefixes += newPrefix.id -> newPrefix + } else { + smallPrefixes += newPrefix.id -> newPrefix } } } + largePrefixes = newLargePrefixes } - // Accumulator for the computed results to be returned, initialized to the frequent items (i.e. - // frequent length-one prefixes) - var resultsAccumulator = freqItemCounts.map { case (item, count) => (List(item), count) }.toList - - // Remaining work to be locally and distributively processed respectfully - var (pairsForLocal, pairsForDistributed) = partitionByProjDBSize(itemSuffixPairs) - - // Continue processing until no pairs for distributed processing remain (i.e. all prefixes have - // projected database sizes <= `maxLocalProjDBSize`) or `maxPatternLength` is reached - var patternLength = 1 - while (pairsForDistributed.count() != 0 && patternLength < maxPatternLength) { - val (nextPatternAndCounts, nextPrefixSuffixPairs) = - extendPrefixes(minCount, pairsForDistributed) - pairsForDistributed.unpersist() - val (smallerPairsPart, largerPairsPart) = partitionByProjDBSize(nextPrefixSuffixPairs) - pairsForDistributed = largerPairsPart - pairsForDistributed.persist(StorageLevel.MEMORY_AND_DISK) - pairsForLocal ++= smallerPairsPart - resultsAccumulator ++= nextPatternAndCounts.collect() - patternLength += 1 // pattern length grows one per iteration + var freqPatterns = sc.parallelize(localFreqPatterns, 1) + + val numSmallPrefixes = smallPrefixes.size + logInfo(s"number of small prefixes for local processing: $numSmallPrefixes") + if (numSmallPrefixes > 0) { + // Switch to local processing. + val bcSmallPrefixes = sc.broadcast(smallPrefixes) + val distributedFreqPattern = postfixes.flatMap { postfix => + bcSmallPrefixes.value.values.map { prefix => + (prefix.id, postfix.project(prefix).compressed) + }.filter(_._2.nonEmpty) + }.groupByKey().flatMap { case (id, projPostfixes) => + val prefix = bcSmallPrefixes.value(id) + val localPrefixSpan = new LocalPrefixSpan(minCount, maxPatternLength - prefix.length) + // TODO: We collect projected postfixes into memory. We should also compare the performance + // TODO: of keeping them on shuffle files. + localPrefixSpan.run(projPostfixes.toArray).map { case (pattern, count) => + (prefix.items ++ pattern, count) + } + } + // Union local frequent patterns and distributed ones. 
+ freqPatterns = freqPatterns ++ distributedFreqPattern } - // Process the small projected databases locally - val remainingResults = getPatternsInLocal( - minCount, sc.parallelize(pairsForLocal, 1).groupByKey()) - - (sc.parallelize(resultsAccumulator, 1) ++ remainingResults) - .map { case (pattern, count) => (flattenSequence(pattern.reverse).toArray, count) } + freqPatterns } - /** - * Partitions the prefix-suffix pairs by projected database size. - * @param prefixSuffixPairs prefix (length n) and suffix pairs, - * @return prefix-suffix pairs partitioned by whether their projected database size is <= or - * greater than [[maxLocalProjDBSize]] + * Represents a prefix. + * @param items items in this prefix, using the internal format + * @param length length of this prefix, not counting 0 */ - private def partitionByProjDBSize(prefixSuffixPairs: RDD[(List[Set[Int]], List[Set[Int]])]) - : (List[(List[Set[Int]], List[Set[Int]])], RDD[(List[Set[Int]], List[Set[Int]])]) = { - val prefixToSuffixSize = prefixSuffixPairs - .aggregateByKey(0)( - seqOp = { case (count, suffix) => count + suffix.length }, - combOp = { _ + _ }) - val smallPrefixes = prefixToSuffixSize - .filter(_._2 <= maxLocalProjDBSize) - .keys - .collect() - .toSet - val small = prefixSuffixPairs.filter { case (prefix, _) => smallPrefixes.contains(prefix) } - val large = prefixSuffixPairs.filter { case (prefix, _) => !smallPrefixes.contains(prefix) } - (small.collect().toList, large) + private[fpm] class Prefix private (val items: Array[Int], val length: Int) extends Serializable { + + /** A unique id for this prefix. */ + val id: Int = Prefix.nextId + + /** Expands this prefix by the input item. */ + def :+(item: Int): Prefix = { + require(item != 0) + if (item < 0) { + new Prefix(items :+ -item, length + 1) + } else { + new Prefix(items ++ Array(0, item), length + 1) + } + } } - /** - * Extends all prefixes by one itemset from their suffix and computes the resulting frequent - * prefixes and remaining work. - * @param minCount minimum count - * @param prefixSuffixPairs prefix (length N) and suffix pairs, - * @return (frequent length N+1 extended prefix, count) pairs and (frequent length N+1 extended - * prefix, corresponding suffix) pairs. 
- */ - private def extendPrefixes( - minCount: Long, - prefixSuffixPairs: RDD[(List[Set[Int]], List[Set[Int]])]) - : (RDD[(List[Set[Int]], Long)], RDD[(List[Set[Int]], List[Set[Int]])]) = { - - // (length N prefix, itemset from suffix) pairs and their corresponding number of occurrences - // Every (prefix :+ suffix) is guaranteed to have support exceeding `minSupport` - val prefixItemPairAndCounts = prefixSuffixPairs - .flatMap { case (prefix, suffix) => - suffix.flatMap(nonemptySubsets(_)).distinct.map(y => ((prefix, y), 1L)) } - .reduceByKey(_ + _) - .filter { case (item, count) => (count >= minCount) } - - // Map from prefix to set of possible next items from suffix - val prefixToNextItems = prefixItemPairAndCounts - .keys - .groupByKey() - .mapValues(_.toSet) - .collect() - .toMap - - // Frequent patterns with length N+1 and their corresponding counts - val extendedPrefixAndCounts = prefixItemPairAndCounts - .map { case ((prefix, item), count) => (item :: prefix, count) } - - // Remaining work, all prefixes will have length N+1 - val extendedPrefixAndSuffix = prefixSuffixPairs - .filter(x => prefixToNextItems.contains(x._1)) - .flatMap { case (prefix, suffix) => - val frequentNextItemSets = prefixToNextItems(prefix) - val frequentNextItems = frequentNextItemSets.flatten - val filteredSuffix = suffix - .map(item => frequentNextItems.intersect(item)) - .filter(_.nonEmpty) - frequentNextItemSets.flatMap { item => - LocalPrefixSpan.getSuffix(item, filteredSuffix) match { - case suffix if !suffix.isEmpty => Some(item :: prefix, suffix) - case _ => None - } - } - } + private[fpm] object Prefix { + /** Internal counter to generate unique IDs. */ + private val counter: AtomicInteger = new AtomicInteger(-1) - (extendedPrefixAndCounts, extendedPrefixAndSuffix) + /** Gets the next unique ID. */ + private def nextId: Int = counter.incrementAndGet() + + /** An empty [[Prefix]] instance. */ + val empty: Prefix = new Prefix(Array.empty, 0) } /** - * Calculate the patterns in local. - * @param minCount the absolute minimum count - * @param data prefixes and projected sequences data data - * @return patterns + * An internal representation of a postfix from some projection. + * We use one int array to store the items, which might also contains other items from the + * original sequence. + * Items are represented by positive integers, and items in each itemset must be distinct and + * ordered. + * we use 0 as the delimiter between itemsets. + * For example, a sequence `<(12)(31)1>` is represented by `[0, 1, 2, 0, 1, 3, 0, 1, 0]`. + * The postfix of this sequence w.r.t. to prefix `<1>` is `<(_2)(13)1>`. + * We may reuse the original items array `[0, 1, 2, 0, 1, 3, 0, 1, 0]` to represent the postfix, + * and mark the start index of the postfix, which is `2` in this example. + * So the active items in this postfix are `[2, 0, 1, 3, 0, 1, 0]`. + * We also remember the start indices of partial projections, the ones that split an itemset. + * For example, another possible partial projection w.r.t. `<1>` is `<(_3)1>`. + * We remember the start indices of partial projections, which is `[2, 5]` in this example. + * This data structure makes it easier to do projections. 
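// Editorial sketch, not part of the patch: constructing the postfix described above
// (fpm-internal API). The sequence <(12)(31)1> is stored as a single 0-delimited array, and a
// projection only moves the start pointer and records partial-itemset entry points, so the
// underlying array is never copied.
val fullSequence = Array(0, 1, 2, 0, 1, 3, 0, 1, 0)  // <(12)(31)1>
val postfix = new Postfix(fullSequence)              // start = 0, no partial projections yet
postfix.nonEmpty                                     // true: items remain after `start`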
+ * + * @param items a sequence stored as `Array[Int]` containing this postfix + * @param start the start index of this postfix in items + * @param partialStarts start indices of possible partial projections, strictly increasing */ - private def getPatternsInLocal( - minCount: Long, - data: RDD[(List[Set[Int]], Iterable[List[Set[Int]]])]): RDD[(List[Set[Int]], Long)] = { - data.flatMap { - case (prefix, projDB) => LocalPrefixSpan.run(minCount, maxPatternLength, prefix, projDB) + private[fpm] class Postfix( + val items: Array[Int], + val start: Int = 0, + val partialStarts: Array[Int] = Array.empty) extends Serializable { + + require(items.last == 0, s"The last item in a postfix must be zero, but got ${items.last}.") + if (partialStarts.nonEmpty) { + require(partialStarts.head >= start, + "The first partial start cannot be smaller than the start index," + + s"but got partialStarts.head = ${partialStarts.head} < start = $start.") } - } -} + /** + * Start index of the first full itemset contained in this postfix. + */ + private[this] def fullStart: Int = { + var i = start + while (items(i) != 0) { + i += 1 + } + i + } + + /** + * Generates length-1 prefix items of this postfix with the corresponding postfix sizes. + * There are two types of prefix items: + * a) The item can be assembled to the last itemset of the prefix. For example, + * the postfix of `<(12)(123)>1` w.r.t. `<1>` is `<(_2)(123)1>`. The prefix items of this + * postfix can be assembled to `<1>` is `_2` and `_3`, resulting new prefixes `<(12)>` and + * `<(13)>`. We flip the sign in the output to indicate that this is a partial prefix item. + * b) The item can be appended to the prefix. Taking the same example above, the prefix items + * can be appended to `<1>` is `1`, `2`, and `3`, resulting new prefixes `<11>`, `<12>`, + * and `<13>`. + * @return an iterator of (prefix item, corresponding postfix size). If the item is negative, it + * indicates a partial prefix item, which should be assembled to the last itemset of the + * current prefix. Otherwise, the item should be appended to the current prefix. + */ + def genPrefixItems: Iterator[(Int, Long)] = { + val n1 = items.length - 1 + // For each unique item (subject to sign) in this sequence, we output exact one split. + // TODO: use PrimitiveKeyOpenHashMap + val prefixes = mutable.Map.empty[Int, Long] + // a) items that can be assembled to the last itemset of the prefix + partialStarts.foreach { start => + var i = start + var x = -items(i) + while (x != 0) { + if (!prefixes.contains(x)) { + prefixes(x) = n1 - i + } + i += 1 + x = -items(i) + } + } + // b) items that can be appended to the prefix + var i = fullStart + while (i < n1) { + val x = items(i) + if (x != 0 && !prefixes.contains(x)) { + prefixes(x) = n1 - i + } + i += 1 + } + prefixes.toIterator + } -object PrefixSpan { - private[fpm] val DELIMITER = -1 + /** Tests whether this postfix is non-empty. */ + def nonEmpty: Boolean = items.length > start + 1 - /** Splits an array of itemsets delimited by [[DELIMITER]]. */ - private[fpm] def splitSequence(sequence: List[Int]): List[Set[Int]] = { - sequence.span(_ != DELIMITER) match { - case (x, xs) if xs.length > 1 => x.toSet :: splitSequence(xs.tail) - case (x, xs) => List(x.toSet) + /** + * Projects this postfix with respect to the input prefix item. + * @param prefix prefix item. If prefix is positive, we match items in any full itemset; if it + * is negative, we do partial projections. 
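// Editorial worked example, not part of the patch, for genPrefixItems above (fpm-internal API).
// For the unprojected postfix of <(12)(31)1>, every candidate is a full-itemset item, so all
// returned prefix items are positive, each paired with the projected-database size n1 - i of
// its first occurrence (n1 = 8 here):
new Postfix(Array(0, 1, 2, 0, 1, 3, 0, 1, 0)).genPrefixItems.toMap
// Map(1 -> 7, 2 -> 6, 3 -> 3), with Long sizes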
+ * @return the projected postfix + */ + def project(prefix: Int): Postfix = { + require(prefix != 0) + val n1 = items.length - 1 + var matched = false + var newStart = n1 + val newPartialStarts = mutable.ArrayBuilder.make[Int] + if (prefix < 0) { + // Search for partial projections. + val target = -prefix + partialStarts.foreach { start => + var i = start + var x = items(i) + while (x != target && x != 0) { + i += 1 + x = items(i) + } + if (x == target) { + i += 1 + if (!matched) { + newStart = i + matched = true + } + if (items(i) != 0) { + newPartialStarts += i + } + } + } + } else { + // Search for items in full itemsets. + // Though the items are ordered in each itemsets, they should be small in practice. + // So a sequential scan is sufficient here, compared to bisection search. + val target = prefix + var i = fullStart + while (i < n1) { + val x = items(i) + if (x == target) { + if (!matched) { + newStart = i + matched = true + } + if (items(i + 1) != 0) { + newPartialStarts += i + 1 + } + } + i += 1 + } + } + new Postfix(items, newStart, newPartialStarts.result()) } - } - /** Flattens a sequence of itemsets into an Array, inserting[[DELIMITER]] between itemsets. */ - private[fpm] def flattenSequence(sequence: List[Set[Int]]): List[Int] = { - val builder = ArrayBuilder.make[Int]() - for (itemSet <- sequence) { - builder += DELIMITER - builder ++= itemSet.toSeq.sorted + /** + * Projects this postfix with respect to the input prefix. + */ + private def project(prefix: Array[Int]): Postfix = { + var partial = true + var cur = this + var i = 0 + val np = prefix.length + while (i < np && cur.nonEmpty) { + val x = prefix(i) + if (x == 0) { + partial = false + } else { + if (partial) { + cur = cur.project(-x) + } else { + cur = cur.project(x) + partial = true + } + } + i += 1 + } + cur } - builder.result().toList.drop(1) // drop trailing delimiter - } - /** Returns an iterator over all non-empty subsets of `itemSet` */ - private[fpm] def nonemptySubsets(itemSet: Set[Int]): Iterator[Set[Int]] = { - // TODO: improve complexity by using partial prefixes, considering one item at a time - itemSet.subsets.filter(_ != Set.empty[Int]) + /** + * Projects this postfix with respect to the input prefix. + */ + def project(prefix: Prefix): Postfix = project(prefix.items) + + /** + * Returns the same sequence with compressed storage if possible. + */ + def compressed: Postfix = { + if (start > 0) { + new Postfix(items.slice(start, items.length), 0, partialStarts.map(_ - start)) + } else { + this + } + } } /** @@ -341,10 +546,14 @@ object PrefixSpan { * @param freq frequency * @tparam Item item type */ - class FreqSequence[Item](val sequence: Array[Array[Item]], val freq: Long) extends Serializable { + @Since("1.5.0") + class FreqSequence[Item] @Since("1.5.0") ( + @Since("1.5.0") val sequence: Array[Array[Item]], + @Since("1.5.0") val freq: Long) extends Serializable { /** * Returns sequence as a Java List of lists for Java users. 
*/ + @Since("1.5.0") def javaSequence: ju.List[ju.List[Item]] = sequence.map(_.toList.asJava).toList.asJava } } @@ -354,5 +563,7 @@ object PrefixSpan { * @param freqSequences frequent sequences * @tparam Item item type */ -class PrefixSpanModel[Item](val freqSequences: RDD[PrefixSpan.FreqSequence[Item]]) +@Since("1.5.0") +class PrefixSpanModel[Item] @Since("1.5.0") ( + @Since("1.5.0") val freqSequences: RDD[PrefixSpan.FreqSequence[Item]]) extends Serializable diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index 9029093e0fa0..ab475af264dd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -305,6 +305,8 @@ private[spark] object BLAS extends Serializable with Logging { "The matrix C cannot be the product of a transpose() call. C.isTransposed must be false.") if (alpha == 0.0 && beta == 1.0) { logDebug("gemm: alpha is equal to 0 and beta is equal to 1. Returning C.") + } else if (alpha == 0.0) { + f2jBLAS.dscal(C.values.length, beta, C.values, 1) } else { A match { case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C) @@ -408,8 +410,8 @@ private[spark] object BLAS extends Serializable with Logging { } } } else { - // Scale matrix first if `beta` is not equal to 0.0 - if (beta != 0.0) { + // Scale matrix first if `beta` is not equal to 1.0 + if (beta != 1.0) { f2jBLAS.dscal(C.values.length, beta, C.values, 1) } // Perform matrix multiplication and add to C. The rows of A are multiplied by the columns of @@ -469,9 +471,11 @@ private[spark] object BLAS extends Serializable with Logging { require(A.numCols == x.size, s"The columns of A don't match the number of elements of x. A: ${A.numCols}, x: ${x.size}") require(A.numRows == y.size, - s"The rows of A don't match the number of elements of y. A: ${A.numRows}, y:${y.size}}") - if (alpha == 0.0) { - logDebug("gemv: alpha is equal to 0. Returning y.") + s"The rows of A don't match the number of elements of y. A: ${A.numRows}, y:${y.size}") + if (alpha == 0.0 && beta == 1.0) { + logDebug("gemv: alpha is equal to 0 and beta is equal to 1. 
Returning y.") + } else if (alpha == 0.0) { + scal(beta, y) } else { (A, x) match { case (smA: SparseMatrix, dvx: DenseVector) => @@ -526,11 +530,6 @@ private[spark] object BLAS extends Serializable with Logging { val xValues = x.values val yValues = y.values - if (alpha == 0.0) { - scal(beta, y) - return - } - if (A.isTransposed) { var rowCounterForA = 0 while (rowCounterForA < mA) { @@ -581,11 +580,6 @@ private[spark] object BLAS extends Serializable with Logging { val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices - if (alpha == 0.0) { - scal(beta, y) - return - } - if (A.isTransposed) { var rowCounter = 0 while (rowCounter < mA) { @@ -604,7 +598,7 @@ private[spark] object BLAS extends Serializable with Logging { rowCounter += 1 } } else { - scal(beta, y) + if (beta != 1.0) scal(beta, y) var colCounterForA = 0 var k = 0 @@ -659,7 +653,7 @@ private[spark] object BLAS extends Serializable with Logging { rowCounter += 1 } } else { - scal(beta, y) + if (beta != 1.0) scal(beta, y) // Perform matrix-vector multiplication and add to y var colCounterForA = 0 while (colCounterForA < nA) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c858348bf20..c02ba426fcc3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.{ArrayBuilder => MArrayBuilder, HashSet => MHash import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since("1.0.0") sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since("1.0.0") def numRows: Int /** Number of columns. */ + @Since("1.0.0") def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since("1.3.0") val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since("1.0.0") def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) => @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since("1.3.0") def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since("1.2.0") def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since("1.3.0") def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since("1.2.0") def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. 
*/ + @Since("1.2.0") def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. */ + @Since("1.4.0") def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since("1.4.0") def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since("1.5.0") def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since("1.5.0") def numActives: Int } @@ -228,12 +242,13 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { * @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in * row major. */ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) -class DenseMatrix( - val numRows: Int, - val numCols: Int, - val values: Array[Double], - override val isTransposed: Boolean) extends Matrix { +class DenseMatrix @Since("1.3.0") ( + @Since("1.0.0") val numRows: Int, + @Since("1.0.0") val numCols: Int, + @Since("1.0.0") val values: Array[Double], + @Since("1.3.0") override val isTransposed: Boolean) extends Matrix { require(values.length == numRows * numCols, "The number of values supplied doesn't match the " + s"size of the matrix! values.length: ${values.length}, numRows * numCols: ${numRows * numCols}") @@ -253,12 +268,12 @@ class DenseMatrix( * @param numCols number of columns * @param values matrix entries in column major */ + @Since("1.0.0") def this(numRows: Int, numCols: Int, values: Array[Double]) = this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { - case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) + case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -277,6 +292,7 @@ class DenseMatrix( private[mllib] def apply(i: Int): Double = values(i) + @Since("1.3.0") override def apply(i: Int, j: Int): Double = values(index(i, j)) private[mllib] def index(i: Int, j: Int): Int = { @@ -287,6 +303,7 @@ class DenseMatrix( values(index(i, j)) = v } + @Since("1.4.0") override def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone()) private[spark] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f), @@ -302,6 +319,7 @@ class DenseMatrix( this } + @Since("1.3.0") override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed) private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { @@ -332,14 +350,17 @@ class DenseMatrix( } } + @Since("1.5.0") override def numNonzeros: Int = values.count(_ != 0) + @Since("1.5.0") override def numActives: Int = values.length /** * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed * set to false. 
*/ + @Since("1.3.0") def toSparse: SparseMatrix = { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -367,6 +388,7 @@ class DenseMatrix( /** * Factory methods for [[org.apache.spark.mllib.linalg.DenseMatrix]]. */ +@Since("1.3.0") object DenseMatrix { /** @@ -375,6 +397,7 @@ object DenseMatrix { * @param numCols number of columns of the matrix * @return `DenseMatrix` with size `numRows` x `numCols` and values of zeros */ + @Since("1.3.0") def zeros(numRows: Int, numCols: Int): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -387,6 +410,7 @@ object DenseMatrix { * @param numCols number of columns of the matrix * @return `DenseMatrix` with size `numRows` x `numCols` and values of ones */ + @Since("1.3.0") def ones(numRows: Int, numCols: Int): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -398,6 +422,7 @@ object DenseMatrix { * @param n number of rows and columns of the matrix * @return `DenseMatrix` with size `n` x `n` and values of ones on the diagonal */ + @Since("1.3.0") def eye(n: Int): DenseMatrix = { val identity = DenseMatrix.zeros(n, n) var i = 0 @@ -415,6 +440,7 @@ object DenseMatrix { * @param rng a random number generator * @return `DenseMatrix` with size `numRows` x `numCols` and values in U(0, 1) */ + @Since("1.3.0") def rand(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -428,6 +454,7 @@ object DenseMatrix { * @param rng a random number generator * @return `DenseMatrix` with size `numRows` x `numCols` and values in N(0, 1) */ + @Since("1.3.0") def randn(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -440,6 +467,7 @@ object DenseMatrix { * @return Square `DenseMatrix` with size `values.length` x `values.length` and `values` * on the diagonal */ + @Since("1.3.0") def diag(vector: Vector): DenseMatrix = { val n = vector.size val matrix = DenseMatrix.zeros(n, n) @@ -475,14 +503,15 @@ object DenseMatrix { * Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs, * and `rowIndices` behave as colIndices, and `values` are stored in row major. */ +@Since("1.2.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) -class SparseMatrix( - val numRows: Int, - val numCols: Int, - val colPtrs: Array[Int], - val rowIndices: Array[Int], - val values: Array[Double], - override val isTransposed: Boolean) extends Matrix { +class SparseMatrix @Since("1.3.0") ( + @Since("1.2.0") val numRows: Int, + @Since("1.2.0") val numCols: Int, + @Since("1.2.0") val colPtrs: Array[Int], + @Since("1.2.0") val rowIndices: Array[Int], + @Since("1.2.0") val values: Array[Double], + @Since("1.3.0") override val isTransposed: Boolean) extends Matrix { require(values.length == rowIndices.length, "The number of row indices and values don't match! 
" + s"values.length: ${values.length}, rowIndices.length: ${rowIndices.length}") @@ -512,6 +541,7 @@ class SparseMatrix( * order for each column * @param values non-zero matrix entries in column major */ + @Since("1.2.0") def this( numRows: Int, numCols: Int, @@ -519,6 +549,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { + case m: Matrix => toBreeze == m.toBreeze + case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) @@ -528,6 +563,7 @@ class SparseMatrix( } } + @Since("1.3.0") override def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) @@ -551,6 +587,7 @@ class SparseMatrix( } } + @Since("1.4.0") override def copy: SparseMatrix = { new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.clone()) } @@ -568,6 +605,7 @@ class SparseMatrix( this } + @Since("1.3.0") override def transpose: SparseMatrix = new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed) @@ -602,12 +640,15 @@ class SparseMatrix( * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed * set to false. */ + @Since("1.3.0") def toDense: DenseMatrix = { new DenseMatrix(numRows, numCols, toArray) } + @Since("1.5.0") override def numNonzeros: Int = values.count(_ != 0) + @Since("1.5.0") override def numActives: Int = values.length } @@ -615,6 +656,7 @@ class SparseMatrix( /** * Factory methods for [[org.apache.spark.mllib.linalg.SparseMatrix]]. */ +@Since("1.3.0") object SparseMatrix { /** @@ -626,6 +668,7 @@ object SparseMatrix { * @param entries Array of (i, j, value) tuples * @return The corresponding `SparseMatrix` */ + @Since("1.3.0") def fromCOO(numRows: Int, numCols: Int, entries: Iterable[(Int, Int, Double)]): SparseMatrix = { val sortedEntries = entries.toSeq.sortBy(v => (v._2, v._1)) val numEntries = sortedEntries.size @@ -674,6 +717,7 @@ object SparseMatrix { * @param n number of rows and columns of the matrix * @return `SparseMatrix` with size `n` x `n` and values of ones on the diagonal */ + @Since("1.3.0") def speye(n: Int): SparseMatrix = { new SparseMatrix(n, n, (0 to n).toArray, (0 until n).toArray, Array.fill(n)(1.0)) } @@ -743,6 +787,7 @@ object SparseMatrix { * @param rng a random number generator * @return `SparseMatrix` with size `numRows` x `numCols` and values in U(0, 1) */ + @Since("1.3.0") def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { val mat = genRandMatrix(numRows, numCols, density, rng) mat.update(i => rng.nextDouble()) @@ -756,6 +801,7 @@ object SparseMatrix { * @param rng a random number generator * @return `SparseMatrix` with size `numRows` x `numCols` and values in N(0, 1) */ + @Since("1.3.0") def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { val mat = genRandMatrix(numRows, numCols, density, rng) mat.update(i => rng.nextGaussian()) @@ -767,6 +813,7 @@ object SparseMatrix { * @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero * `values` on the diagonal */ + @Since("1.3.0") def spdiag(vector: Vector): SparseMatrix = { val n = vector.size vector match { @@ -783,6 +830,7 @@ object SparseMatrix { /** * Factory methods for [[org.apache.spark.mllib.linalg.Matrix]]. 
*/ +@Since("1.0.0") object Matrices { /** @@ -792,6 +840,7 @@ object Matrices { * @param numCols number of columns * @param values matrix entries in column major */ + @Since("1.0.0") def dense(numRows: Int, numCols: Int, values: Array[Double]): Matrix = { new DenseMatrix(numRows, numCols, values) } @@ -805,6 +854,7 @@ object Matrices { * @param rowIndices the row index of the entry * @param values non-zero matrix entries in column major */ + @Since("1.2.0") def sparse( numRows: Int, numCols: Int, @@ -838,6 +888,7 @@ object Matrices { * @param numCols number of columns of the matrix * @return `Matrix` with size `numRows` x `numCols` and values of zeros */ + @Since("1.2.0") def zeros(numRows: Int, numCols: Int): Matrix = DenseMatrix.zeros(numRows, numCols) /** @@ -846,6 +897,7 @@ object Matrices { * @param numCols number of columns of the matrix * @return `Matrix` with size `numRows` x `numCols` and values of ones */ + @Since("1.2.0") def ones(numRows: Int, numCols: Int): Matrix = DenseMatrix.ones(numRows, numCols) /** @@ -853,6 +905,7 @@ object Matrices { * @param n number of rows and columns of the matrix * @return `Matrix` with size `n` x `n` and values of ones on the diagonal */ + @Since("1.2.0") def eye(n: Int): Matrix = DenseMatrix.eye(n) /** @@ -860,6 +913,7 @@ object Matrices { * @param n number of rows and columns of the matrix * @return `Matrix` with size `n` x `n` and values of ones on the diagonal */ + @Since("1.3.0") def speye(n: Int): Matrix = SparseMatrix.speye(n) /** @@ -869,6 +923,7 @@ object Matrices { * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) */ + @Since("1.2.0") def rand(numRows: Int, numCols: Int, rng: Random): Matrix = DenseMatrix.rand(numRows, numCols, rng) @@ -880,6 +935,7 @@ object Matrices { * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) */ + @Since("1.3.0") def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = SparseMatrix.sprand(numRows, numCols, density, rng) @@ -890,6 +946,7 @@ object Matrices { * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) */ + @Since("1.2.0") def randn(numRows: Int, numCols: Int, rng: Random): Matrix = DenseMatrix.randn(numRows, numCols, rng) @@ -901,6 +958,7 @@ object Matrices { * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) */ + @Since("1.3.0") def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = SparseMatrix.sprandn(numRows, numCols, density, rng) @@ -910,6 +968,7 @@ object Matrices { * @return Square `Matrix` with size `values.length` x `values.length` and `values` * on the diagonal */ + @Since("1.2.0") def diag(vector: Vector): Matrix = DenseMatrix.diag(vector) /** @@ -919,6 +978,7 @@ object Matrices { * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were horizontally concatenated */ + @Since("1.3.0") def horzcat(matrices: Array[Matrix]): Matrix = { if (matrices.isEmpty) { return new DenseMatrix(0, 0, Array[Double]()) @@ -977,6 +1037,7 @@ object Matrices { * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were vertically concatenated */ + @Since("1.3.0") def vertcat(matrices: Array[Matrix]): Matrix = { if (matrices.isEmpty) { return new DenseMatrix(0, 0, Array[Double]()) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index b416d50a5631..4dcf8f28f202 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -17,12 +17,13 @@ package org.apache.spark.mllib.linalg -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Represents singular value decomposition (SVD) factors. */ +@Since("1.0.0") @Experimental case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType) @@ -30,6 +31,7 @@ case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VTyp * :: Experimental :: * Represents QR factors. */ +@Since("1.5.0") @Experimental -case class QRDecomposition[UType, VType](Q: UType, R: VType) +case class QRDecomposition[QType, RType](Q: QType, R: RType) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 96d1f48ba2ba..06ebb1586990 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -26,7 +26,7 @@ import scala.collection.JavaConverters._ import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import org.apache.spark.SparkException -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{AlphaComponent, Since} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow @@ -38,16 +38,19 @@ import org.apache.spark.sql.types._ * Note: Users should not implement this interface. */ @SQLUserDefinedType(udt = classOf[VectorUDT]) +@Since("1.0.0") sealed trait Vector extends Serializable { /** * Size of the vector. */ + @Since("1.0.0") def size: Int /** * Converts the instance to a double array. */ + @Since("1.0.0") def toArray: Array[Double] override def equals(other: Any): Boolean = { @@ -99,11 +102,13 @@ sealed trait Vector extends Serializable { * Gets the value of the ith element. * @param i index */ + @Since("1.1.0") def apply(i: Int): Double = toBreeze(i) /** * Makes a deep copy of this vector. */ + @Since("1.1.0") def copy: Vector = { throw new NotImplementedError(s"copy is not implemented for ${this.getClass}.") } @@ -121,26 +126,31 @@ sealed trait Vector extends Serializable { * Number of active entries. An "active entry" is an element which is explicitly stored, * regardless of its value. Note that inactive entries have value 0. */ + @Since("1.4.0") def numActives: Int /** * Number of nonzero elements. This scans all active values and count nonzeros. */ + @Since("1.4.0") def numNonzeros: Int /** * Converts this vector to a sparse vector with all explicit zeros removed. */ + @Since("1.4.0") def toSparse: SparseVector /** * Converts this vector to a dense vector. */ + @Since("1.4.0") def toDense: DenseVector = new DenseVector(this.toArray) /** * Returns a vector in either dense or sparse format, whichever uses less storage. */ + @Since("1.4.0") def compressed: Vector = { val nnz = numNonzeros // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. 
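// Editorial sketch, not part of the patch, of the rule of thumb implied by the size comment
// above (the exact cutoff used by `compressed` is not shown in this hunk): with a dense vector
// costing about 8 * size + 8 bytes and a sparse one about 12 * nnz + 20 bytes, sparse storage
// wins only while fewer than roughly two thirds of the entries are non-zero.
def sparseIsSmaller(size: Int, nnz: Int): Boolean =
  12.0 * nnz + 20.0 < 8.0 * size + 8.0

sparseIsSmaller(100, 10)  // true: ~140 bytes sparse vs ~808 bytes dense
sparseIsSmaller(100, 90)  // false: ~1100 bytes sparse vs ~808 bytes dense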
@@ -155,19 +165,18 @@ sealed trait Vector extends Serializable { * Find the index of a maximal element. Returns the first maximal element in case of a tie. * Returns -1 if vector has length 0. */ + @Since("1.5.0") def argmax: Int } /** - * :: DeveloperApi :: + * :: AlphaComponent :: * * User-defined type for [[Vector]] which allows easy interaction with SQL * via [[org.apache.spark.sql.DataFrame]]. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@DeveloperApi -private[spark] class VectorUDT extends UserDefinedType[Vector] { +@AlphaComponent +class VectorUDT extends UserDefinedType[Vector] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense @@ -243,11 +252,13 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] { * We don't use the name `Vector` because Scala imports * [[scala.collection.immutable.Vector]] by default. */ +@Since("1.0.0") object Vectors { /** * Creates a dense vector from its values. */ + @Since("1.0.0") @varargs def dense(firstValue: Double, otherValues: Double*): Vector = new DenseVector((firstValue +: otherValues).toArray) @@ -256,6 +267,7 @@ object Vectors { /** * Creates a dense vector from a double array. */ + @Since("1.0.0") def dense(values: Array[Double]): Vector = new DenseVector(values) /** @@ -265,6 +277,7 @@ object Vectors { * @param indices index array, must be strictly increasing. * @param values value array, must have the same length as indices. */ + @Since("1.0.0") def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector = new SparseVector(size, indices, values) @@ -274,6 +287,7 @@ object Vectors { * @param size vector size. * @param elements vector elements in (index, value) pairs. */ + @Since("1.0.0") def sparse(size: Int, elements: Seq[(Int, Double)]): Vector = { require(size > 0, "The size of the requested sparse vector must be greater than 0.") @@ -295,6 +309,7 @@ object Vectors { * @param size vector size. * @param elements vector elements in (index, value) pairs. */ + @Since("1.0.0") def sparse(size: Int, elements: JavaIterable[(JavaInteger, JavaDouble)]): Vector = { sparse(size, elements.asScala.map { case (i, x) => (i.intValue(), x.doubleValue()) @@ -307,6 +322,7 @@ object Vectors { * @param size vector size * @return a zero vector */ + @Since("1.1.0") def zeros(size: Int): Vector = { new DenseVector(new Array[Double](size)) } @@ -314,6 +330,7 @@ object Vectors { /** * Parses a string resulted from [[Vector.toString]] into a [[Vector]]. */ + @Since("1.1.0") def parse(s: String): Vector = { parseNumeric(NumericParser.parse(s)) } @@ -357,6 +374,7 @@ object Vectors { * @param p norm. * @return norm in L^p^ space. */ + @Since("1.3.0") def norm(vector: Vector, p: Double): Double = { require(p >= 1.0, "To compute the p-norm of the vector, we require that you specify a p>=1. " + s"You specified p=$p.") @@ -409,6 +427,7 @@ object Vectors { * @param v2 second Vector. * @return squared distance between two Vectors. */ + @Since("1.3.0") def sqdist(v1: Vector, v2: Vector): Double = { require(v1.size == v2.size, s"Vector dimensions do not match: Dim(v1)=${v1.size} and Dim(v2)" + s"=${v2.size}.") @@ -522,19 +541,25 @@ object Vectors { /** * A dense vector represented by a value array. 
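// Editorial sketch, not part of the patch: the Vectors factory methods and helpers annotated
// above.
import org.apache.spark.mllib.linalg.Vectors

val dv = Vectors.dense(1.0, 0.0, 3.0)
val sv = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))  // the same vector, stored sparsely
val sv2 = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))      // from (index, value) pairs
Vectors.zeros(3)                                          // dense (0.0, 0.0, 0.0)
Vectors.norm(Vectors.dense(3.0, 4.0), 2.0)                // 5.0
Vectors.sqdist(dv, Vectors.dense(0.0, 0.0, 3.0))          // 1.0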
*/ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) -class DenseVector(val values: Array[Double]) extends Vector { +class DenseVector @Since("1.0.0") ( + @Since("1.0.0") val values: Array[Double]) extends Vector { + @Since("1.0.0") override def size: Int = values.length override def toString: String = values.mkString("[", ",", "]") + @Since("1.0.0") override def toArray: Array[Double] = values private[spark] override def toBreeze: BV[Double] = new BDV[Double](values) + @Since("1.0.0") override def apply(i: Int): Double = values(i) + @Since("1.1.0") override def copy: DenseVector = { new DenseVector(values.clone()) } @@ -566,8 +591,10 @@ class DenseVector(val values: Array[Double]) extends Vector { result } + @Since("1.4.0") override def numActives: Int = size + @Since("1.4.0") override def numNonzeros: Int = { // same as values.count(_ != 0.0) but faster var nnz = 0 @@ -579,6 +606,7 @@ class DenseVector(val values: Array[Double]) extends Vector { nnz } + @Since("1.4.0") override def toSparse: SparseVector = { val nnz = numNonzeros val ii = new Array[Int](nnz) @@ -594,6 +622,7 @@ class DenseVector(val values: Array[Double]) extends Vector { new SparseVector(size, ii, vv) } + @Since("1.5.0") override def argmax: Int = { if (size == 0) { -1 @@ -613,8 +642,11 @@ class DenseVector(val values: Array[Double]) extends Vector { } } +@Since("1.3.0") object DenseVector { + /** Extracts the value array from a dense vector. */ + @Since("1.3.0") def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values) } @@ -625,11 +657,12 @@ object DenseVector { * @param indices index array, assume to be strictly increasing. * @param values value array, must have the same length as the index array. */ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) -class SparseVector( - override val size: Int, - val indices: Array[Int], - val values: Array[Double]) extends Vector { +class SparseVector @Since("1.0.0") ( + @Since("1.0.0") override val size: Int, + @Since("1.0.0") val indices: Array[Int], + @Since("1.0.0") val values: Array[Double]) extends Vector { require(indices.length == values.length, "Sparse vectors require that the dimension of the" + s" indices match the dimension of the values. You provided ${indices.length} indices and " + @@ -640,6 +673,7 @@ class SparseVector( override def toString: String = s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" + @Since("1.0.0") override def toArray: Array[Double] = { val data = new Array[Double](size) var i = 0 @@ -651,6 +685,7 @@ class SparseVector( data } + @Since("1.1.0") override def copy: SparseVector = { new SparseVector(size, indices.clone(), values.clone()) } @@ -691,8 +726,10 @@ class SparseVector( result } + @Since("1.4.0") override def numActives: Int = values.length + @Since("1.4.0") override def numNonzeros: Int = { var nnz = 0 values.foreach { v => @@ -703,6 +740,7 @@ class SparseVector( nnz } + @Since("1.4.0") override def toSparse: SparseVector = { val nnz = numNonzeros if (nnz == numActives) { @@ -722,6 +760,7 @@ class SparseVector( } } + @Since("1.5.0") override def argmax: Int = { if (size == 0) { -1 @@ -766,9 +805,35 @@ class SparseVector( maxIdx } } + + /** + * Create a slice of this vector based on the given indices. + * @param selectedIndices Unsorted list of indices into the vector. + * This does NOT do bound checking. + * @return New SparseVector with values in the order specified by the given indices. + * + * NOTE: The API needs to be discussed before making this public. 
+ * Also, if we have a version assuming indices are sorted, we should optimize it. + */ + private[spark] def slice(selectedIndices: Array[Int]): SparseVector = { + var currentIdx = 0 + val (sliceInds, sliceVals) = selectedIndices.flatMap { origIdx => + val iIdx = java.util.Arrays.binarySearch(this.indices, origIdx) + val i_v = if (iIdx >= 0) { + Iterator((currentIdx, this.values(iIdx))) + } else { + Iterator() + } + currentIdx += 1 + i_v + }.unzip + new SparseVector(selectedIndices.length, sliceInds.toArray, sliceVals.toArray) + } } +@Since("1.3.0") object SparseVector { + @Since("1.3.0") def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] = Some((sv.size, sv.indices, sv.values)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 3323ae7b1fba..a33b6137cf9c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.{Logging, Partitioner, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -129,11 +129,12 @@ private[mllib] object GridPartitioner { * @param nCols Number of columns of this matrix. If the supplied value is less than or equal to * zero, the number of columns will be calculated when `numCols` is invoked. */ +@Since("1.3.0") @Experimental -class BlockMatrix( - val blocks: RDD[((Int, Int), Matrix)], - val rowsPerBlock: Int, - val colsPerBlock: Int, +class BlockMatrix @Since("1.3.0") ( + @Since("1.3.0") val blocks: RDD[((Int, Int), Matrix)], + @Since("1.3.0") val rowsPerBlock: Int, + @Since("1.3.0") val colsPerBlock: Int, private var nRows: Long, private var nCols: Long) extends DistributedMatrix with Logging { @@ -150,6 +151,7 @@ class BlockMatrix( * @param colsPerBlock Number of columns that make up each block. The blocks forming the final * columns are not required to have the given number of columns */ + @Since("1.3.0") def this( blocks: RDD[((Int, Int), Matrix)], rowsPerBlock: Int, @@ -157,17 +159,21 @@ class BlockMatrix( this(blocks, rowsPerBlock, colsPerBlock, 0L, 0L) } + @Since("1.3.0") override def numRows(): Long = { if (nRows <= 0L) estimateDim() nRows } + @Since("1.3.0") override def numCols(): Long = { if (nCols <= 0L) estimateDim() nCols } + @Since("1.3.0") val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt + @Since("1.3.0") val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt private[mllib] def createPartitioner(): GridPartitioner = @@ -193,6 +199,7 @@ class BlockMatrix( * Validates the block matrix info against the matrix data (`blocks`) and throws an exception if * any error is found. */ + @Since("1.3.0") def validate(): Unit = { logDebug("Validating BlockMatrix...") // check if the matrix is larger than the claimed dimensions @@ -229,18 +236,21 @@ class BlockMatrix( } /** Caches the underlying RDD. */ + @Since("1.3.0") def cache(): this.type = { blocks.cache() this } /** Persists the underlying RDD with the specified storage level. 
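// Editorial sketch, not part of the patch: assembling a small BlockMatrix by hand (`sc` is
// assumed). Each block is keyed by its (blockRowIndex, blockColIndex).
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.BlockMatrix

val blocks = sc.parallelize(Seq(
  ((0, 0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))),
  ((1, 0), Matrices.dense(2, 2, Array(2.0, 0.0, 0.0, 2.0)))))
val mat = new BlockMatrix(blocks, 2, 2)   // a 4 x 2 matrix made of 2 x 2 blocks
mat.validate()                            // checks the blocks against the claimed dimensions
mat.cache()
val doubled = mat.add(mat)                // requires equal sizes and matching block sizes
val gram = mat.transpose.multiply(mat)    // left colsPerBlock must equal right rowsPerBlock
val local = gram.toLocalMatrix()          // a 2 x 2 DenseMatrix collected on the driver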
*/ + @Since("1.3.0") def persist(storageLevel: StorageLevel): this.type = { blocks.persist(storageLevel) this } /** Converts to CoordinateMatrix. */ + @Since("1.3.0") def toCoordinateMatrix(): CoordinateMatrix = { val entryRDD = blocks.flatMap { case ((blockRowIndex, blockColIndex), mat) => val rowStart = blockRowIndex.toLong * rowsPerBlock @@ -255,6 +265,7 @@ class BlockMatrix( } /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ + @Since("1.3.0") def toIndexedRowMatrix(): IndexedRowMatrix = { require(numCols() < Int.MaxValue, "The number of columns must be within the integer range. " + s"numCols: ${numCols()}") @@ -263,6 +274,7 @@ class BlockMatrix( } /** Collect the distributed matrix on the driver as a `DenseMatrix`. */ + @Since("1.3.0") def toLocalMatrix(): Matrix = { require(numRows() < Int.MaxValue, "The number of rows of this matrix should be less than " + s"Int.MaxValue. Currently numRows: ${numRows()}") @@ -287,8 +299,11 @@ class BlockMatrix( new DenseMatrix(m, n, values) } - /** Transpose this `BlockMatrix`. Returns a new `BlockMatrix` instance sharing the - * same underlying data. Is a lazy operation. */ + /** + * Transpose this `BlockMatrix`. Returns a new `BlockMatrix` instance sharing the + * same underlying data. Is a lazy operation. + */ + @Since("1.3.0") def transpose: BlockMatrix = { val transposedBlocks = blocks.map { case ((blockRowIndex, blockColIndex), mat) => ((blockColIndex, blockRowIndex), mat.transpose) @@ -302,12 +317,14 @@ class BlockMatrix( new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray) } - /** Adds two block matrices together. The matrices must have the same size and matching - * `rowsPerBlock` and `colsPerBlock` values. If one of the blocks that are being added are - * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even - * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will - * also be a [[DenseMatrix]]. - */ + /** + * Adds two block matrices together. The matrices must have the same size and matching + * `rowsPerBlock` and `colsPerBlock` values. If one of the blocks that are being added are + * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even + * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will + * also be a [[DenseMatrix]]. + */ + @Since("1.3.0") def add(other: BlockMatrix): BlockMatrix = { require(numRows() == other.numRows(), "Both matrices must have the same number of rows. " + s"A.numRows: ${numRows()}, B.numRows: ${other.numRows()}") @@ -335,12 +352,14 @@ class BlockMatrix( } } - /** Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` - * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains - * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output - * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause - * some performance issues until support for multiplying two sparse matrices is added. - */ + /** + * Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` + * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains + * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output + * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. 
This may cause + * some performance issues until support for multiplying two sparse matrices is added. + */ + @Since("1.3.0") def multiply(other: BlockMatrix): BlockMatrix = { require(numCols() == other.numRows(), "The number of columns of A and the number of rows " + s"of B must be equal. A.numCols: ${numCols()}, B.numRows: ${other.numRows()}. If you " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index 078d1fac4444..644f293d88a7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Matrix, SparseMatrix, Vectors} @@ -30,6 +30,7 @@ import org.apache.spark.mllib.linalg.{Matrix, SparseMatrix, Vectors} * @param j column index * @param value value of the entry */ +@Since("1.0.0") @Experimental case class MatrixEntry(i: Long, j: Long, value: Double) @@ -43,16 +44,19 @@ case class MatrixEntry(i: Long, j: Long, value: Double) * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the max column index plus one. */ +@Since("1.0.0") @Experimental -class CoordinateMatrix( - val entries: RDD[MatrixEntry], +class CoordinateMatrix @Since("1.0.0") ( + @Since("1.0.0") val entries: RDD[MatrixEntry], private var nRows: Long, private var nCols: Long) extends DistributedMatrix { /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(entries: RDD[MatrixEntry]) = this(entries, 0L, 0L) /** Gets or computes the number of columns. */ + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0L) { computeSize() @@ -61,6 +65,7 @@ class CoordinateMatrix( } /** Gets or computes the number of rows. */ + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { computeSize() @@ -69,11 +74,13 @@ class CoordinateMatrix( } /** Transposes this CoordinateMatrix. */ + @Since("1.3.0") def transpose(): CoordinateMatrix = { new CoordinateMatrix(entries.map(x => MatrixEntry(x.j, x.i, x.value)), numCols(), numRows()) } /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ + @Since("1.0.0") def toIndexedRowMatrix(): IndexedRowMatrix = { val nl = numCols() if (nl > Int.MaxValue) { @@ -93,11 +100,13 @@ class CoordinateMatrix( * Converts to RowMatrix, dropping row indices after grouping by row index. * The number of columns must be within the integer range. */ + @Since("1.0.0") def toRowMatrix(): RowMatrix = { toIndexedRowMatrix().toRowMatrix() } /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } @@ -110,6 +119,7 @@ class CoordinateMatrix( * a smaller value. Must be an integer value greater than 0. * @return a [[BlockMatrix]] */ + @Since("1.3.0") def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { require(rowsPerBlock > 0, s"rowsPerBlock needs to be greater than 0. 
rowsPerBlock: $rowsPerBlock") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala index a0e26ce3bc46..db3433a5e245 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala @@ -19,15 +19,20 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} +import org.apache.spark.annotation.Since + /** * Represents a distributively stored matrix backed by one or more RDDs. */ +@Since("1.0.0") trait DistributedMatrix extends Serializable { /** Gets or computes the number of rows. */ + @Since("1.0.0") def numRows(): Long /** Gets or computes the number of columns. */ + @Since("1.0.0") def numCols(): Long /** Collects data and assembles a local dense breeze matrix (for test only). */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 1c33b43ea7a8..b20ea0dc50da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.SingularValueDecomposition @@ -28,6 +28,7 @@ import org.apache.spark.mllib.linalg.SingularValueDecomposition * :: Experimental :: * Represents a row of [[org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix]]. */ +@Since("1.0.0") @Experimental case class IndexedRow(index: Long, vector: Vector) @@ -42,15 +43,18 @@ case class IndexedRow(index: Long, vector: Vector) * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the size of the first row. */ +@Since("1.0.0") @Experimental -class IndexedRowMatrix( - val rows: RDD[IndexedRow], +class IndexedRowMatrix @Since("1.0.0") ( + @Since("1.0.0") val rows: RDD[IndexedRow], private var nRows: Long, private var nCols: Int) extends DistributedMatrix { /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(rows: RDD[IndexedRow]) = this(rows, 0L, 0) + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0) { // Calling `first` will throw an exception if `rows` is empty. @@ -59,6 +63,7 @@ class IndexedRowMatrix( nCols } + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { // Reduce will throw an exception if `rows` is empty. @@ -71,11 +76,13 @@ class IndexedRowMatrix( * Drops row indices and converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.RowMatrix]]. */ + @Since("1.0.0") def toRowMatrix(): RowMatrix = { new RowMatrix(rows.map(_.vector), 0L, nCols) } /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } @@ -88,6 +95,7 @@ class IndexedRowMatrix( * a smaller value. Must be an integer value greater than 0. 
* @return a [[BlockMatrix]] */ + @Since("1.3.0") def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { // TODO: This implementation may be optimized toCoordinateMatrix().toBlockMatrix(rowsPerBlock, colsPerBlock) @@ -97,6 +105,7 @@ class IndexedRowMatrix( * Converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.CoordinateMatrix]]. */ + @Since("1.3.0") def toCoordinateMatrix(): CoordinateMatrix = { val entries = rows.flatMap { row => val rowIndex = row.index @@ -133,6 +142,7 @@ class IndexedRowMatrix( * are treated as zero, where sigma(0) is the largest singular value. * @return SingularValueDecomposition(U, s, V) */ + @Since("1.0.0") def computeSVD( k: Int, computeU: Boolean = false, @@ -159,6 +169,7 @@ class IndexedRowMatrix( * @param B a local matrix whose number of rows must match the number of columns of this matrix * @return an IndexedRowMatrix representing the product, which preserves partitioning */ + @Since("1.0.0") def multiply(B: Matrix): IndexedRowMatrix = { val mat = toRowMatrix().multiply(B) val indexedRows = rows.map(_.index).zip(mat.rows).map { case (i, v) => @@ -170,6 +181,7 @@ class IndexedRowMatrix( /** * Computes the Gramian matrix `A^T A`. */ + @Since("1.0.0") def computeGramianMatrix(): Matrix = { toRowMatrix().computeGramianMatrix() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index bfc90c9ef852..9a423ddafdc0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -28,7 +28,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.Logging import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary} import org.apache.spark.rdd.RDD @@ -45,16 +45,19 @@ import org.apache.spark.storage.StorageLevel * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the size of the first row. */ +@Since("1.0.0") @Experimental -class RowMatrix( - val rows: RDD[Vector], +class RowMatrix @Since("1.0.0") ( + @Since("1.0.0") val rows: RDD[Vector], private var nRows: Long, private var nCols: Int) extends DistributedMatrix with Logging { /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(rows: RDD[Vector]) = this(rows, 0L, 0) /** Gets or computes the number of columns. */ + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0) { try { @@ -70,6 +73,7 @@ class RowMatrix( } /** Gets or computes the number of rows. */ + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { nRows = rows.count() @@ -108,6 +112,7 @@ class RowMatrix( /** * Computes the Gramian matrix `A^T A`. */ + @Since("1.0.0") def computeGramianMatrix(): Matrix = { val n = numCols().toInt checkNumColumns(n) @@ -178,6 +183,7 @@ class RowMatrix( * are treated as zero, where sigma(0) is the largest singular value. * @return SingularValueDecomposition(U, s, V). U = null if computeU = false. 
*/ + @Since("1.0.0") def computeSVD( k: Int, computeU: Boolean = false, @@ -318,6 +324,7 @@ class RowMatrix( * Computes the covariance matrix, treating each row as an observation. * @return a local dense matrix of size n x n */ + @Since("1.0.0") def computeCovariance(): Matrix = { val n = numCols().toInt checkNumColumns(n) @@ -371,6 +378,7 @@ class RowMatrix( * @param k number of top principal components. * @return a matrix of size n-by-k, whose columns are principal components */ + @Since("1.0.0") def computePrincipalComponents(k: Int): Matrix = { val n = numCols().toInt require(k > 0 && k <= n, s"k = $k out of range (0, n = $n]") @@ -389,6 +397,7 @@ class RowMatrix( /** * Computes column-wise summary statistics. */ + @Since("1.0.0") def computeColumnSummaryStatistics(): MultivariateStatisticalSummary = { val summary = rows.treeAggregate(new MultivariateOnlineSummarizer)( (aggregator, data) => aggregator.add(data), @@ -404,6 +413,7 @@ class RowMatrix( * @return a [[org.apache.spark.mllib.linalg.distributed.RowMatrix]] representing the product, * which preserves partitioning */ + @Since("1.0.0") def multiply(B: Matrix): RowMatrix = { val n = numCols().toInt val k = B.numCols @@ -436,6 +446,7 @@ class RowMatrix( * @return An n x n sparse upper-triangular matrix of cosine similarities between * columns of this matrix. */ + @Since("1.2.0") def columnSimilarities(): CoordinateMatrix = { columnSimilarities(0.0) } @@ -479,6 +490,7 @@ class RowMatrix( * @return An n x n sparse upper-triangular matrix of cosine similarities * between columns of this matrix. */ + @Since("1.2.0") def columnSimilarities(threshold: Double): CoordinateMatrix = { require(threshold >= 0, s"Threshold cannot be negative: $threshold") @@ -507,6 +519,7 @@ class RowMatrix( * @param computeQ whether to computeQ * @return QRDecomposition(Q, R), Q = null if computeQ = false. */ + @Since("1.5.0") def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = { val col = numCols().toInt // split rows horizontally into smaller matrices, and compute QR for each of them @@ -656,6 +669,7 @@ class RowMatrix( } } +@Since("1.0.0") @Experimental object RowMatrix { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4aa010..3b663b5defb0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize > 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. 
+ */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4ebb10..274ac7c99553 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). */ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala index 9349ecaa13f5..a2d85a68cd32 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.random import org.apache.commons.math3.distribution.{ExponentialDistribution, GammaDistribution, LogNormalDistribution, PoissonDistribution} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} /** @@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} * Trait for random data generators that generate i.i.d. data. */ @DeveloperApi +@Since("1.1.0") trait RandomDataGenerator[T] extends Pseudorandom with Serializable { /** * Returns an i.i.d. sample as a generic type from an underlying distribution. */ + @Since("1.1.0") def nextValue(): T /** * Returns a copy of the RandomDataGenerator with a new instance of the rng object used in the * class when applicable for non-locking concurrent usage. */ + @Since("1.1.0") def copy(): RandomDataGenerator[T] } @@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable { * Generates i.i.d. samples from U[0.0, 1.0] */ @DeveloperApi +@Since("1.1.0") class UniformGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. 
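As an illustration of the `RandomDataGenerator` contract annotated above, a hypothetical generator (the class name and distribution are invented for this sketch; `sc` is an assumed SparkContext):

    import org.apache.spark.mllib.random.{RandomDataGenerator, RandomRDDs}

    // Draws +1.0 or -1.0 with equal probability; java.util.Random keeps the class serializable.
    class SignGenerator extends RandomDataGenerator[Double] {
      private val rng = new java.util.Random()
      override def nextValue(): Double = if (rng.nextBoolean()) 1.0 else -1.0
      override def setSeed(seed: Long): Unit = rng.setSeed(seed)
      override def copy(): SignGenerator = new SignGenerator()
    }

    val signs = RandomRDDs.randomRDD(sc, new SignGenerator, 1000L, 4)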
private val random = new XORShiftRandom() + @Since("1.1.0") override def nextValue(): Double = { random.nextDouble() } + @Since("1.1.0") override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since("1.1.0") override def copy(): UniformGenerator = new UniformGenerator() } @@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] { * Generates i.i.d. samples from the standard normal distribution. */ @DeveloperApi +@Since("1.1.0") class StandardNormalGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. private val random = new XORShiftRandom() + @Since("1.1.0") override def nextValue(): Double = { random.nextGaussian() } + @Since("1.1.0") override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since("1.1.0") override def copy(): StandardNormalGenerator = new StandardNormalGenerator() } @@ -87,16 +98,21 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] { * @param mean mean for the Poisson distribution. */ @DeveloperApi -class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { +@Since("1.1.0") +class PoissonGenerator @Since("1.1.0") ( + @Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] { private val rng = new PoissonDistribution(mean) + @Since("1.1.0") override def nextValue(): Double = rng.sample() + @Since("1.1.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.1.0") override def copy(): PoissonGenerator = new PoissonGenerator(mean) } @@ -107,16 +123,21 @@ class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { * @param mean mean for the exponential distribution. */ @DeveloperApi -class ExponentialGenerator(val mean: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class ExponentialGenerator @Since("1.3.0") ( + @Since("1.3.0") val mean: Double) extends RandomDataGenerator[Double] { private val rng = new ExponentialDistribution(mean) + @Since("1.3.0") override def nextValue(): Double = rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): ExponentialGenerator = new ExponentialGenerator(mean) } @@ -128,16 +149,22 @@ class ExponentialGenerator(val mean: Double) extends RandomDataGenerator[Double] * @param scale scale for the gamma distribution */ @DeveloperApi -class GammaGenerator(val shape: Double, val scale: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class GammaGenerator @Since("1.3.0") ( + @Since("1.3.0") val shape: Double, + @Since("1.3.0") val scale: Double) extends RandomDataGenerator[Double] { private val rng = new GammaDistribution(shape, scale) + @Since("1.3.0") override def nextValue(): Double = rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): GammaGenerator = new GammaGenerator(shape, scale) } @@ -150,15 +177,21 @@ class GammaGenerator(val shape: Double, val scale: Double) extends RandomDataGen * @param std standard deviation for the log normal distribution */ @DeveloperApi -class LogNormalGenerator(val mean: Double, val std: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class LogNormalGenerator @Since("1.3.0") ( + @Since("1.3.0") val mean: Double, + @Since("1.3.0") val std: Double) extends RandomDataGenerator[Double] { private val rng = new LogNormalDistribution(mean, std) + @Since("1.3.0") override def nextValue(): Double = 
rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): LogNormalGenerator = new LogNormalGenerator(mean, std) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index 174d5e0f6c9f..4dd5ea214d67 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.random import scala.reflect.ClassTag import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD, JavaSparkContext} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.rdd.{RandomRDD, RandomVectorRDD} @@ -32,6 +32,7 @@ import org.apache.spark.util.Utils * Generator methods for creating RDDs comprised of `i.i.d.` samples from some distribution. */ @Experimental +@Since("1.1.0") object RandomRDDs { /** @@ -46,6 +47,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ `U(0.0, 1.0)`. */ + @Since("1.1.0") def uniformRDD( sc: SparkContext, size: Long, @@ -58,6 +60,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#uniformRDD]]. */ + @Since("1.1.0") def uniformJavaRDD( jsc: JavaSparkContext, size: Long, @@ -69,6 +72,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaRDD]] with the default seed. */ + @Since("1.1.0") def uniformJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size, numPartitions)) } @@ -76,6 +80,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def uniformJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size)) } @@ -92,6 +97,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ N(0.0, 1.0). */ + @Since("1.1.0") def normalRDD( sc: SparkContext, size: Long, @@ -104,6 +110,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#normalRDD]]. */ + @Since("1.1.0") def normalJavaRDD( jsc: JavaSparkContext, size: Long, @@ -115,6 +122,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaRDD]] with the default seed. */ + @Since("1.1.0") def normalJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size, numPartitions)) } @@ -122,6 +130,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def normalJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size)) } @@ -137,6 +146,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.1.0") def poissonRDD( sc: SparkContext, mean: Double, @@ -150,6 +160,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#poissonRDD]]. 
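The concrete generators above can be used directly or plugged into the generic factories in `RandomRDDs`; a brief sketch with an assumed SparkContext `sc`:

    import org.apache.spark.mllib.random.{LogNormalGenerator, PoissonGenerator, RandomRDDs}

    val poisson = new PoissonGenerator(2.0)
    poisson.setSeed(42L)
    val oneDraw = poisson.nextValue()
    // Generic vector factory: 1000 rows of 3 i.i.d. log-normal samples each.
    val logNormalVecs = RandomRDDs.randomVectorRDD(sc, new LogNormalGenerator(0.0, 1.0), 1000L, 3)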
*/ + @Since("1.1.0") def poissonJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -162,6 +173,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaRDD]] with the default seed. */ + @Since("1.1.0") def poissonJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -173,6 +185,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def poissonJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(poissonRDD(jsc.sc, mean, size)) } @@ -188,6 +201,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def exponentialRDD( sc: SparkContext, mean: Double, @@ -201,6 +215,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#exponentialRDD]]. */ + @Since("1.3.0") def exponentialJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -213,6 +228,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaRDD]] with the default seed. */ + @Since("1.3.0") def exponentialJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -224,6 +240,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def exponentialJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(exponentialRDD(jsc.sc, mean, size)) } @@ -240,6 +257,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def gammaRDD( sc: SparkContext, shape: Double, @@ -254,6 +272,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#gammaRDD]]. */ + @Since("1.3.0") def gammaJavaRDD( jsc: JavaSparkContext, shape: Double, @@ -267,6 +286,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaRDD]] with the default seed. */ + @Since("1.3.0") def gammaJavaRDD( jsc: JavaSparkContext, shape: Double, @@ -279,11 +299,12 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def gammaJavaRDD( - jsc: JavaSparkContext, - shape: Double, - scale: Double, - size: Long): JavaDoubleRDD = { + jsc: JavaSparkContext, + shape: Double, + scale: Double, + size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(gammaRDD(jsc.sc, shape, scale, size)) } @@ -299,6 +320,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def logNormalRDD( sc: SparkContext, mean: Double, @@ -313,6 +335,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#logNormalRDD]]. */ + @Since("1.3.0") def logNormalJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -326,6 +349,7 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaRDD]] with the default seed. */ + @Since("1.3.0") def logNormalJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -338,11 +362,12 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaRDD]] with the default number of partitions and the default seed. 
*/ + @Since("1.3.0") def logNormalJavaRDD( - jsc: JavaSparkContext, - mean: Double, - std: Double, - size: Long): JavaDoubleRDD = { + jsc: JavaSparkContext, + mean: Double, + std: Double, + size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(logNormalRDD(jsc.sc, mean, std, size)) } @@ -359,6 +384,7 @@ object RandomRDDs { * @return RDD[Double] comprised of `i.i.d.` samples produced by generator. */ @DeveloperApi + @Since("1.1.0") def randomRDD[T: ClassTag]( sc: SparkContext, generator: RandomDataGenerator[T], @@ -381,6 +407,7 @@ object RandomRDDs { * @param seed Seed for the RNG that generates the seed for the generator in each partition. * @return RDD[Vector] with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. */ + @Since("1.1.0") def uniformVectorRDD( sc: SparkContext, numRows: Long, @@ -394,6 +421,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#uniformVectorRDD]]. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -406,6 +434,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaVectorRDD]] with the default seed. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -417,6 +446,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -435,6 +465,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `N(0.0, 1.0)`. */ + @Since("1.1.0") def normalVectorRDD( sc: SparkContext, numRows: Long, @@ -448,6 +479,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#normalVectorRDD]]. */ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -460,6 +492,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaVectorRDD]] with the default seed. */ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -471,6 +504,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -491,6 +525,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples. */ + @Since("1.3.0") def logNormalVectorRDD( sc: SparkContext, mean: Double, @@ -507,6 +542,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#logNormalVectorRDD]]. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -521,6 +557,7 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaVectorRDD]] with the default seed. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -535,6 +572,7 @@ object RandomRDDs { * [[RandomRDDs#logNormalJavaVectorRDD]] with the default number of partitions and * the default seed. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -556,6 +594,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Pois(mean). */ + @Since("1.1.0") def poissonVectorRDD( sc: SparkContext, mean: Double, @@ -570,6 +609,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#poissonVectorRDD]]. 
*/ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -583,6 +623,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaVectorRDD]] with the default seed. */ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -595,6 +636,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -615,6 +657,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean). */ + @Since("1.3.0") def exponentialVectorRDD( sc: SparkContext, mean: Double, @@ -630,6 +673,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#exponentialVectorRDD]]. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -643,6 +687,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaVectorRDD]] with the default seed. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -656,6 +701,7 @@ object RandomRDDs { * [[RandomRDDs#exponentialJavaVectorRDD]] with the default number of partitions * and the default seed. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -678,6 +724,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean). */ + @Since("1.3.0") def gammaVectorRDD( sc: SparkContext, shape: Double, @@ -693,6 +740,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#gammaVectorRDD]]. */ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -707,6 +755,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaVectorRDD]] with the default seed. */ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -720,6 +769,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -744,6 +794,7 @@ object RandomRDDs { * @return RDD[Vector] with vectors containing `i.i.d.` samples produced by generator. */ @DeveloperApi + @Since("1.1.0") def randomVectorRDD(sc: SparkContext, generator: RandomDataGenerator[Double], numRows: Long, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 56c549ef99cb..33aaf853e599 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.recommendation import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.ml.recommendation.{ALS => NewALS} import org.apache.spark.rdd.RDD @@ -26,9 +26,12 @@ import org.apache.spark.storage.StorageLevel /** * A more compact class to represent a rating than Tuple3[Int, Int, Double]. 
- * @since 0.8.0 */ -case class Rating(user: Int, product: Int, rating: Double) +@Since("0.8.0") +case class Rating @Since("0.8.0") ( + @Since("0.8.0") user: Int, + @Since("0.8.0") product: Int, + @Since("0.8.0") rating: Double) /** * Alternating Least Squares matrix factorization. @@ -59,6 +62,7 @@ case class Rating(user: Int, product: Int, rating: Double) * indicated user * preferences rather than explicit ratings given to items. */ +@Since("0.8.0") class ALS private ( private var numUserBlocks: Int, private var numProductBlocks: Int, @@ -74,6 +78,7 @@ class ALS private ( * Constructs an ALS instance with default parameters: {numBlocks: -1, rank: 10, iterations: 10, * lambda: 0.01, implicitPrefs: false, alpha: 1.0}. */ + @Since("0.8.0") def this() = this(-1, -1, 10, 10, 0.01, false, 1.0) /** If true, do alternating nonnegative least squares. */ @@ -90,6 +95,7 @@ class ALS private ( * Set the number of blocks for both user blocks and product blocks to parallelize the computation * into; pass -1 for an auto-configured number of blocks. Default: -1. */ + @Since("0.8.0") def setBlocks(numBlocks: Int): this.type = { this.numUserBlocks = numBlocks this.numProductBlocks = numBlocks @@ -99,6 +105,7 @@ class ALS private ( /** * Set the number of user blocks to parallelize the computation. */ + @Since("1.1.0") def setUserBlocks(numUserBlocks: Int): this.type = { this.numUserBlocks = numUserBlocks this @@ -107,30 +114,35 @@ class ALS private ( /** * Set the number of product blocks to parallelize the computation. */ + @Since("1.1.0") def setProductBlocks(numProductBlocks: Int): this.type = { this.numProductBlocks = numProductBlocks this } /** Set the rank of the feature matrices computed (number of features). Default: 10. */ + @Since("0.8.0") def setRank(rank: Int): this.type = { this.rank = rank this } /** Set the number of iterations to run. Default: 10. */ + @Since("0.8.0") def setIterations(iterations: Int): this.type = { this.iterations = iterations this } /** Set the regularization parameter, lambda. Default: 0.01. */ + @Since("0.8.0") def setLambda(lambda: Double): this.type = { this.lambda = lambda this } /** Sets whether to use implicit preference. Default: false. */ + @Since("0.8.1") def setImplicitPrefs(implicitPrefs: Boolean): this.type = { this.implicitPrefs = implicitPrefs this @@ -139,12 +151,14 @@ class ALS private ( /** * Sets the constant used in computing confidence in implicit ALS. Default: 1.0. */ + @Since("0.8.1") def setAlpha(alpha: Double): this.type = { this.alpha = alpha this } /** Sets a random seed to have deterministic results. */ + @Since("1.0.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -154,6 +168,7 @@ class ALS private ( * Set whether the least-squares problems solved at each iteration should have * nonnegativity constraints. */ + @Since("1.1.0") def setNonnegative(b: Boolean): this.type = { this.nonnegative = b this @@ -166,6 +181,7 @@ class ALS private ( * set `spark.rdd.compress` to `true` to reduce the space requirement, at the cost of speed. */ @DeveloperApi + @Since("1.1.0") def setIntermediateRDDStorageLevel(storageLevel: StorageLevel): this.type = { require(storageLevel != StorageLevel.NONE, "ALS is not designed to run without persisting intermediate RDDs.") @@ -181,6 +197,7 @@ class ALS private ( * at the cost of speed. */ @DeveloperApi + @Since("1.3.0") def setFinalRDDStorageLevel(storageLevel: StorageLevel): this.type = { this.finalRDDStorageLevel = storageLevel this @@ -194,6 +211,7 @@ class ALS private ( * this setting is ignored. 
*/ @DeveloperApi + @Since("1.4.0") def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval this @@ -203,6 +221,7 @@ class ALS private ( * Run ALS with the configured parameters on an input RDD of (user, product, rating) triples. * Returns a MatrixFactorizationModel with feature vectors for each user and product. */ + @Since("0.8.0") def run(ratings: RDD[Rating]): MatrixFactorizationModel = { val sc = ratings.context @@ -250,13 +269,14 @@ class ALS private ( /** * Java-friendly version of [[ALS.run]]. */ + @Since("1.3.0") def run(ratings: JavaRDD[Rating]): MatrixFactorizationModel = run(ratings.rdd) } /** * Top-level methods for calling Alternating Least Squares (ALS) matrix factorization. - * @since 0.8.0 */ +@Since("0.8.0") object ALS { /** * Train a matrix factorization model given an RDD of ratings given by users to some products, @@ -271,8 +291,8 @@ object ALS { * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into * @param seed random seed - * @since 0.9.1 */ + @Since("0.9.1") def train( ratings: RDD[Rating], rank: Int, @@ -296,8 +316,8 @@ object ALS { * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into - * @since 0.8.0 */ + @Since("0.8.0") def train( ratings: RDD[Rating], rank: Int, @@ -319,8 +339,8 @@ object ALS { * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) - * @since 0.8.0 */ + @Since("0.8.0") def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double) : MatrixFactorizationModel = { train(ratings, rank, iterations, lambda, -1) @@ -336,8 +356,8 @@ object ALS { * @param ratings RDD of (userID, productID, rating) pairs * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) - * @since 0.8.0 */ + @Since("0.8.0") def train(ratings: RDD[Rating], rank: Int, iterations: Int) : MatrixFactorizationModel = { train(ratings, rank, iterations, 0.01, -1) @@ -357,8 +377,8 @@ object ALS { * @param blocks level of parallelism to split computation into * @param alpha confidence parameter * @param seed random seed - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit( ratings: RDD[Rating], rank: Int, @@ -384,8 +404,8 @@ object ALS { * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into * @param alpha confidence parameter - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit( ratings: RDD[Rating], rank: Int, @@ -409,8 +429,8 @@ object ALS { * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) * @param alpha confidence parameter - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double) : MatrixFactorizationModel = { trainImplicit(ratings, rank, iterations, lambda, -1, alpha) @@ -427,8 +447,8 @@ object ALS { * @param ratings RDD of (userID, productID, rating) pairs * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int) : MatrixFactorizationModel = { trainImplicit(ratings, rank, 
iterations, 0.01, -1, 1.0) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 261ca9cef0c5..46562eb2ad0f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -30,6 +30,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ @@ -49,12 +50,12 @@ import org.apache.spark.storage.StorageLevel * the features computed for this user. * @param productFeatures RDD of tuples where each tuple represents the productId * and the features computed for this product. - * @since 0.8.0 */ -class MatrixFactorizationModel( - val rank: Int, - val userFeatures: RDD[(Int, Array[Double])], - val productFeatures: RDD[(Int, Array[Double])]) +@Since("0.8.0") +class MatrixFactorizationModel @Since("0.8.0") ( + @Since("0.8.0") val rank: Int, + @Since("0.8.0") val userFeatures: RDD[(Int, Array[Double])], + @Since("0.8.0") val productFeatures: RDD[(Int, Array[Double])]) extends Saveable with Serializable with Logging { require(rank > 0) @@ -74,9 +75,8 @@ class MatrixFactorizationModel( } } - /** Predict the rating of one user for one product. - * @since 0.8.0 - */ + /** Predict the rating of one user for one product. */ + @Since("0.8.0") def predict(user: Int, product: Int): Double = { val userVector = userFeatures.lookup(user).head val productVector = productFeatures.lookup(product).head @@ -114,8 +114,8 @@ class MatrixFactorizationModel( * * @param usersProducts RDD of (user, product) pairs. * @return RDD of Ratings. - * @since 0.9.0 */ + @Since("0.9.0") def predict(usersProducts: RDD[(Int, Int)]): RDD[Rating] = { // Previously the partitions of ratings are only based on the given products. // So if the usersProducts given for prediction contains only few products or @@ -146,8 +146,8 @@ class MatrixFactorizationModel( /** * Java-friendly version of [[MatrixFactorizationModel.predict]]. - * @since 1.2.0 */ + @Since("1.2.0") def predict(usersProducts: JavaPairRDD[JavaInteger, JavaInteger]): JavaRDD[Rating] = { predict(usersProducts.rdd.asInstanceOf[RDD[(Int, Int)]]).toJavaRDD() } @@ -162,8 +162,8 @@ class MatrixFactorizationModel( * by score, decreasing. The first returned is the one predicted to be most strongly * recommended to the user. The score is an opaque value that indicates how strongly * recommended the product is. - * @since 1.1.0 */ + @Since("1.1.0") def recommendProducts(user: Int, num: Int): Array[Rating] = MatrixFactorizationModel.recommend(userFeatures.lookup(user).head, productFeatures, num) .map(t => Rating(user, t._1, t._2)) @@ -179,8 +179,8 @@ class MatrixFactorizationModel( * by score, decreasing. The first returned is the one predicted to be most strongly * recommended to the product. The score is an opaque value that indicates how strongly * recommended the user is. 
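A compact end-to-end sketch of the ALS / `MatrixFactorizationModel` API annotated here (SparkContext `sc` assumed; the ratings and save path are made up):

    import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}

    val ratings = sc.parallelize(Seq(
      Rating(1, 10, 4.0), Rating(1, 20, 1.0),
      Rating(2, 10, 5.0), Rating(2, 30, 3.0)))
    val model = ALS.train(ratings, rank = 5, iterations = 10, lambda = 0.01)
    val score = model.predict(1, 30)            // one user, one product
    val top2 = model.recommendProducts(1, 2)    // two highest-scoring products for user 1
    model.save(sc, "/tmp/als-model")            // hypothetical path
    val reloaded = MatrixFactorizationModel.load(sc, "/tmp/als-model")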
- * @since 1.1.0 */ + @Since("1.1.0") def recommendUsers(product: Int, num: Int): Array[Rating] = MatrixFactorizationModel.recommend(productFeatures.lookup(product).head, userFeatures, num) .map(t => Rating(t._1, product, t._2)) @@ -199,8 +199,8 @@ class MatrixFactorizationModel( * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. - * @since 1.3.0 */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { MatrixFactorizationModel.SaveLoadV1_0.save(this, path) } @@ -212,8 +212,8 @@ class MatrixFactorizationModel( * @return [(Int, Array[Rating])] objects, where every tuple contains a userID and an array of * rating objects which contains the same userId, recommended productID and a "score" in the * rating field. Semantics of score is same as recommendProducts API - * @since 1.4.0 */ + @Since("1.4.0") def recommendProductsForUsers(num: Int): RDD[(Int, Array[Rating])] = { MatrixFactorizationModel.recommendForAll(rank, userFeatures, productFeatures, num).map { case (user, top) => @@ -230,8 +230,8 @@ class MatrixFactorizationModel( * @return [(Int, Array[Rating])] objects, where every tuple contains a productID and an array * of rating objects which contains the recommended userId, same productID and a "score" in the * rating field. Semantics of score is same as recommendUsers API - * @since 1.4.0 */ + @Since("1.4.0") def recommendUsersForProducts(num: Int): RDD[(Int, Array[Rating])] = { MatrixFactorizationModel.recommendForAll(rank, productFeatures, userFeatures, num).map { case (product, top) => @@ -241,9 +241,7 @@ class MatrixFactorizationModel( } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { import org.apache.spark.mllib.util.Loader._ @@ -326,8 +324,8 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. * @return Model instance - * @since 1.3.0 */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): MatrixFactorizationModel = { val (loadedClassName, formatVersion, _) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 6709bd79bc82..7e3b4d5648fe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.feature.StandardScaler import org.apache.spark.{Logging, SparkException} import org.apache.spark.rdd.RDD @@ -34,9 +34,13 @@ import org.apache.spark.storage.StorageLevel * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. 
+ * */ +@Since("0.8.0") @DeveloperApi -abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double) +abstract class GeneralizedLinearModel @Since("1.0.0") ( + @Since("1.0.0") val weights: Vector, + @Since("0.8.0") val intercept: Double) extends Serializable { /** @@ -53,7 +57,9 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * * @param testData RDD representing data points to be predicted * @return RDD[Double] where each entry contains the corresponding prediction + * */ + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] = { // A small optimization to avoid serializing the entire model. Only the weightsMatrix // and intercept is needed. @@ -71,7 +77,9 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * * @param testData array representing a single data point * @return Double prediction from the trained model + * */ + @Since("1.0.0") def predict(testData: Vector): Double = { predictPoint(testData, weights, intercept) } @@ -88,14 +96,20 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * :: DeveloperApi :: * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM). * This class should be extended with an Optimizer to create a new GLM. + * */ +@Since("0.8.0") @DeveloperApi abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] extends Logging with Serializable { protected val validators: Seq[RDD[LabeledPoint] => Boolean] = List() - /** The optimizer to solve the problem. */ + /** + * The optimizer to solve the problem. + * + */ + @Since("0.8.0") def optimizer: Optimizer /** Whether to add intercept (default: false). */ @@ -130,7 +144,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * The dimension of training features. + * */ + @Since("1.4.0") def getNumFeatures: Int = this.numFeatures /** @@ -153,13 +169,17 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Get if the algorithm uses addIntercept + * */ + @Since("1.4.0") def isAddIntercept: Boolean = this.addIntercept /** * Set if the algorithm should add an intercept. Default false. * We set the default to false because adding the intercept will cause memory allocation. + * */ + @Since("0.8.0") def setIntercept(addIntercept: Boolean): this.type = { this.addIntercept = addIntercept this @@ -167,7 +187,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Set if the algorithm should validate data before training. Default true. + * */ + @Since("0.8.0") def setValidateData(validateData: Boolean): this.type = { this.validateData = validateData this @@ -176,7 +198,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Run the algorithm with the configured parameters on an input * RDD of LabeledPoint entries. + * */ + @Since("0.8.0") def run(input: RDD[LabeledPoint]): M = { if (numFeatures < 0) { numFeatures = input.map(_.features.size).first() @@ -208,7 +232,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Run the algorithm with the configured parameters on an input RDD * of LabeledPoint entries starting from the initial weights provided. 
+ * */ + @Since("1.0.0") def run(input: RDD[LabeledPoint], initialWeights: Vector): M = { if (numFeatures < 0) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index f3b46c75c05f..877d31ba4130 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -29,7 +29,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -46,12 +46,14 @@ import org.apache.spark.sql.SQLContext * @param predictions Array of predictions associated to the boundaries at the same index. * Results of isotonic regression and therefore monotone. * @param isotonic indicates whether this is isotonic or antitonic. + * */ +@Since("1.3.0") @Experimental -class IsotonicRegressionModel ( - val boundaries: Array[Double], - val predictions: Array[Double], - val isotonic: Boolean) extends Serializable with Saveable { +class IsotonicRegressionModel @Since("1.3.0") ( + @Since("1.3.0") val boundaries: Array[Double], + @Since("1.3.0") val predictions: Array[Double], + @Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable { private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse @@ -59,7 +61,10 @@ class IsotonicRegressionModel ( assertOrdered(boundaries) assertOrdered(predictions)(predictionOrd) - /** A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. */ + /** + * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. + */ + @Since("1.4.0") def this(boundaries: java.lang.Iterable[Double], predictions: java.lang.Iterable[Double], isotonic: java.lang.Boolean) = { @@ -83,7 +88,9 @@ class IsotonicRegressionModel ( * * @param testData Features to be labeled. * @return Predicted labels. + * */ + @Since("1.3.0") def predict(testData: RDD[Double]): RDD[Double] = { testData.map(predict) } @@ -94,7 +101,9 @@ class IsotonicRegressionModel ( * * @param testData Features to be labeled. * @return Predicted labels. + * */ + @Since("1.3.0") def predict(testData: JavaDoubleRDD): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(predict(testData.rdd.retag.asInstanceOf[RDD[Double]])) } @@ -114,7 +123,9 @@ class IsotonicRegressionModel ( * 3) If testData falls between two values in boundary array then prediction is treated * as piecewise linear function and interpolated value is returned. In case there are * multiple values with the same boundary then the same rules as in 2) are used. + * */ + @Since("1.3.0") def predict(testData: Double): Double = { def linearInterpolation(x1: Double, y1: Double, x2: Double, y2: Double, x: Double): Double = { @@ -148,6 +159,7 @@ class IsotonicRegressionModel ( /** A convenient method for boundaries called by the Python API. 
*/ private[mllib] def predictionVector: Vector = Vectors.dense(predictions) + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { IsotonicRegressionModel.SaveLoadV1_0.save(sc, path, boundaries, predictions, isotonic) } @@ -155,6 +167,7 @@ class IsotonicRegressionModel ( override protected def formatVersion: String = "1.0" } +@Since("1.4.0") object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { import org.apache.spark.mllib.util.Loader._ @@ -200,6 +213,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { } } + @Since("1.4.0") override def load(sc: SparkContext, path: String): IsotonicRegressionModel = { implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = loadMetadata(sc, path) @@ -239,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]] */ @Experimental +@Since("1.3.0") class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { /** @@ -246,6 +261,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * * @return New instance of IsotonicRegression. */ + @Since("1.3.0") def this() = this(true) /** @@ -254,6 +270,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * @param isotonic Isotonic (increasing) or antitonic (decreasing) sequence. * @return This instance of IsotonicRegression. */ + @Since("1.3.0") def setIsotonic(isotonic: Boolean): this.type = { this.isotonic = isotonic this @@ -269,6 +286,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * the algorithm is executed. * @return Isotonic regression model. */ + @Since("1.3.0") def run(input: RDD[(Double, Double, Double)]): IsotonicRegressionModel = { val preprocessedInput = if (isotonic) { input @@ -294,6 +312,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * the algorithm is executed. * @return Isotonic regression model. */ + @Since("1.3.0") def run(input: JavaRDD[(JDouble, JDouble, JDouble)]): IsotonicRegressionModel = { run(input.rdd.retag.asInstanceOf[RDD[(Double, Double, Double)]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala index d5fea822ad77..c284ad232537 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.regression import scala.beans.BeanInfo +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @@ -29,8 +30,11 @@ import org.apache.spark.SparkException * @param label Label for this data point. * @param features List of features for this data point. */ +@Since("0.8.0") @BeanInfo -case class LabeledPoint(label: Double, features: Vector) { +case class LabeledPoint @Since("1.0.0") ( + @Since("0.8.0") label: Double, + @Since("1.0.0") features: Vector) { override def toString: String = { s"($label,$features)" } @@ -38,12 +42,16 @@ case class LabeledPoint(label: Double, features: Vector) { /** * Parser for [[org.apache.spark.mllib.regression.LabeledPoint]]. 
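A short sketch of the isotonic regression workflow annotated in this file (input triples are (label, feature, weight); `sc` is an assumed SparkContext):

    import org.apache.spark.mllib.regression.IsotonicRegression

    val points = sc.parallelize(Seq(
      (1.0, 1.0, 1.0), (3.0, 2.0, 1.0), (2.0, 3.0, 1.0), (5.0, 4.0, 1.0)))
    val model = new IsotonicRegression().setIsotonic(true).run(points)
    val atBoundary = model.predict(2.0)   // exactly on a boundary: the stored prediction
    val inBetween = model.predict(2.5)    // between boundaries: linear interpolation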
+ * */ +@Since("1.1.0") object LabeledPoint { /** * Parses a string resulted from `LabeledPoint#toString` into * an [[org.apache.spark.mllib.regression.LabeledPoint]]. + * */ + @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala index 4f482384f0f3..a9aba173fa0e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -30,10 +31,12 @@ import org.apache.spark.rdd.RDD * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. + * */ -class LassoModel ( - override val weights: Vector, - override val intercept: Double) +@Since("0.8.0") +class LassoModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -44,6 +47,7 @@ class LassoModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -51,8 +55,10 @@ class LassoModel ( override protected def formatVersion: String = "1.0" } +@Since("1.3.0") object LassoModel extends Loader[LassoModel] { + @Since("1.3.0") override def load(sc: SparkContext, path: String): LassoModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -78,6 +84,7 @@ object LassoModel extends Loader[LassoModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. */ +@Since("0.8.0") class LassoWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -87,6 +94,7 @@ class LassoWithSGD private ( private val gradient = new LeastSquaresGradient() private val updater = new L1Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -97,6 +105,7 @@ class LassoWithSGD private ( * Construct a Lasso object with default parameters: {stepSize: 1.0, numIterations: 100, * regParam: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { @@ -106,7 +115,9 @@ class LassoWithSGD private ( /** * Top-level methods for calling Lasso. + * */ +@Since("0.8.0") object LassoWithSGD { /** @@ -123,7 +134,9 @@ object LassoWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. + * */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -146,7 +159,9 @@ object LassoWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. 
* @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -167,7 +182,9 @@ object LassoWithSGD { * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a LassoModel which has the weights and offset from training. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -185,7 +202,9 @@ object LassoWithSGD { * matrix A as well as the corresponding right hand side label y * @param numIterations Number of iterations of gradient descent to run. * @return a LassoModel which has the weights and offset from training. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): LassoModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index 9453c4f66c21..4996ace5df85 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -30,10 +31,12 @@ import org.apache.spark.rdd.RDD * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. + * */ -class LinearRegressionModel ( - override val weights: Vector, - override val intercept: Double) +@Since("0.8.0") +class LinearRegressionModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -44,6 +47,7 @@ class LinearRegressionModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -51,8 +55,10 @@ class LinearRegressionModel ( override protected def formatVersion: String = "1.0" } +@Since("1.3.0") object LinearRegressionModel extends Loader[LinearRegressionModel] { + @Since("1.3.0") override def load(sc: SparkContext, path: String): LinearRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -79,6 +85,7 @@ object LinearRegressionModel extends Loader[LinearRegressionModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. 
*/ +@Since("0.8.0") class LinearRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -87,6 +94,7 @@ class LinearRegressionWithSGD private[mllib] ( private val gradient = new LeastSquaresGradient() private val updater = new SimpleUpdater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -96,6 +104,7 @@ class LinearRegressionWithSGD private[mllib] ( * Construct a LinearRegression object with default parameters: {stepSize: 1.0, * numIterations: 100, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 1.0) override protected[mllib] def createModel(weights: Vector, intercept: Double) = { @@ -105,7 +114,9 @@ class LinearRegressionWithSGD private[mllib] ( /** * Top-level methods for calling LinearRegression. + * */ +@Since("0.8.0") object LinearRegressionWithSGD { /** @@ -121,7 +132,9 @@ object LinearRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. + * */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -142,7 +155,9 @@ object LinearRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -161,7 +176,9 @@ object LinearRegressionWithSGD { * @param stepSize Step size to be used for each iteration of Gradient Descent. * @param numIterations Number of iterations of gradient descent to run. * @return a LinearRegressionModel which has the weights and offset from training. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -178,7 +195,9 @@ object LinearRegressionWithSGD { * matrix A as well as the corresponding right hand side label y * @param numIterations Number of iterations of gradient descent to run. * @return a LinearRegressionModel which has the weights and offset from training. 
+ * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): LinearRegressionModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala index 214ac4d0ed7d..0e72d6591ce8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala @@ -19,11 +19,12 @@ package org.apache.spark.mllib.regression import org.json4s.{DefaultFormats, JValue} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD +@Since("0.8.0") @Experimental trait RegressionModel extends Serializable { /** @@ -31,7 +32,9 @@ trait RegressionModel extends Serializable { * * @param testData RDD representing data points to be predicted * @return RDD[Double] where each entry contains the corresponding prediction + * */ + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -39,14 +42,18 @@ trait RegressionModel extends Serializable { * * @param testData array representing a single data point * @return Double prediction from the trained model + * */ + @Since("1.0.0") def predict(testData: Vector): Double /** * Predict values for examples stored in a JavaRDD. * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction + * */ + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index 7d28ffad45c9..0a44ff559d55 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -31,10 +32,12 @@ import org.apache.spark.rdd.RDD * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. 
+ * */ -class RidgeRegressionModel ( - override val weights: Vector, - override val intercept: Double) +@Since("0.8.0") +class RidgeRegressionModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -45,6 +48,7 @@ class RidgeRegressionModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -52,8 +56,10 @@ class RidgeRegressionModel ( override protected def formatVersion: String = "1.0" } +@Since("1.3.0") object RidgeRegressionModel extends Loader[RidgeRegressionModel] { + @Since("1.3.0") override def load(sc: SparkContext, path: String): RidgeRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -79,6 +85,7 @@ object RidgeRegressionModel extends Loader[RidgeRegressionModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. */ +@Since("0.8.0") class RidgeRegressionWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -88,7 +95,7 @@ class RidgeRegressionWithSGD private ( private val gradient = new LeastSquaresGradient() private val updater = new SquaredL2Updater() - + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -99,6 +106,7 @@ class RidgeRegressionWithSGD private ( * Construct a RidgeRegression object with default parameters: {stepSize: 1.0, numIterations: 100, * regParam: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { @@ -108,7 +116,9 @@ class RidgeRegressionWithSGD private ( /** * Top-level methods for calling RidgeRegression. + * */ +@Since("0.8.0") object RidgeRegressionWithSGD { /** @@ -124,7 +134,9 @@ object RidgeRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. + * */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -146,7 +158,9 @@ object RidgeRegressionWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -166,7 +180,9 @@ object RidgeRegressionWithSGD { * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a RidgeRegressionModel which has the weights and offset from training. + * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -183,7 +199,9 @@ object RidgeRegressionWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a RidgeRegressionModel which has the weights and offset from training. 
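The train() overloads annotated in these hunks follow the same pattern for Lasso, linear and ridge regression with SGD. A minimal sketch for the ridge variant, illustrative only (not part of the patch; data, iteration count and SparkContext name are arbitrary assumptions):

    import org.apache.spark.SparkContext
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD}

    // Assumes an existing SparkContext named sc.
    def ridgeExample(sc: SparkContext): Unit = {
      val training = sc.parallelize(Seq(
        LabeledPoint(1.0, Vectors.dense(0.0, 1.1)),
        LabeledPoint(2.0, Vectors.dense(1.0, 2.1)),
        LabeledPoint(3.0, Vectors.dense(2.0, 3.1))))
      // Simplest overload: defaults are used for step size, regularization and mini-batch fraction.
      val model = RidgeRegressionWithSGD.train(training, 100)
      println(model.predict(Vectors.dense(1.5, 2.5)))
    }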
+ * */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): RidgeRegressionModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index 141052ba813e..73948b2d9851 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -20,9 +20,9 @@ package org.apache.spark.mllib.regression import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaSparkContext.fakeClassTag -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} import org.apache.spark.streaming.dstream.DStream @@ -53,7 +53,9 @@ import org.apache.spark.streaming.dstream.DStream * It is also ok to call trainOn on different streams; this will update * the model using each of the different sources, in sequence. * + * */ +@Since("1.1.0") @DeveloperApi abstract class StreamingLinearAlgorithm[ M <: GeneralizedLinearModel, @@ -65,7 +67,11 @@ abstract class StreamingLinearAlgorithm[ /** The algorithm to use for updating. */ protected val algorithm: A - /** Return the latest model. */ + /** + * Return the latest model. + * + */ + @Since("1.1.0") def latestModel(): M = { model.get } @@ -78,6 +84,7 @@ abstract class StreamingLinearAlgorithm[ * * @param data DStream containing labeled data */ + @Since("1.1.0") def trainOn(data: DStream[LabeledPoint]): Unit = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting training.") @@ -95,7 +102,10 @@ abstract class StreamingLinearAlgorithm[ } } - /** Java-friendly version of `trainOn`. */ + /** + * Java-friendly version of `trainOn`. + */ + @Since("1.3.0") def trainOn(data: JavaDStream[LabeledPoint]): Unit = trainOn(data.dstream) /** @@ -103,7 +113,9 @@ abstract class StreamingLinearAlgorithm[ * * @param data DStream containing feature vectors * @return DStream containing predictions + * */ + @Since("1.1.0") def predictOn(data: DStream[Vector]): DStream[Double] = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting prediction.") @@ -111,7 +123,11 @@ abstract class StreamingLinearAlgorithm[ data.map{x => model.get.predict(x)} } - /** Java-friendly version of `predictOn`. */ + /** + * Java-friendly version of `predictOn`. + * + */ + @Since("1.3.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Double] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Double]]) } @@ -121,7 +137,9 @@ abstract class StreamingLinearAlgorithm[ * @param data DStream containing feature vectors * @tparam K key type * @return DStream containing the input keys and the predictions as values + * */ + @Since("1.1.0") def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Double)] = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting prediction") @@ -130,7 +148,11 @@ abstract class StreamingLinearAlgorithm[ } - /** Java-friendly version of `predictOnValues`. */ + /** + * Java-friendly version of `predictOnValues`. 
+ * + */ + @Since("1.3.0") def predictOnValues[K](data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Double] = { implicit val tag = fakeClassTag[K] JavaPairDStream.fromPairDStream( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index c6d04464a12b..fe1d487cdd07 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.Vector /** @@ -39,9 +39,9 @@ import org.apache.spark.mllib.linalg.Vector * .setNumIterations(10) * .setInitialWeights(Vectors.dense(...)) * .trainOn(DStream) - * */ @Experimental +@Since("1.1.0") class StreamingLinearRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -55,40 +55,56 @@ class StreamingLinearRegressionWithSGD private[mllib] ( * Initial weights must be set before using trainOn or predictOn * (see `StreamingLinearAlgorithm`) */ + @Since("1.1.0") def this() = this(0.1, 50, 1.0) + @Since("1.1.0") val algorithm = new LinearRegressionWithSGD(stepSize, numIterations, miniBatchFraction) protected var model: Option[LinearRegressionModel] = None - /** Set the step size for gradient descent. Default: 0.1. */ + /** + * Set the step size for gradient descent. Default: 0.1. + */ + @Since("1.1.0") def setStepSize(stepSize: Double): this.type = { this.algorithm.optimizer.setStepSize(stepSize) this } - /** Set the number of iterations of gradient descent to run per update. Default: 50. */ + /** + * Set the number of iterations of gradient descent to run per update. Default: 50. + */ + @Since("1.1.0") def setNumIterations(numIterations: Int): this.type = { this.algorithm.optimizer.setNumIterations(numIterations) this } - /** Set the fraction of each batch to use for updates. Default: 1.0. */ + /** + * Set the fraction of each batch to use for updates. Default: 1.0. + */ + @Since("1.1.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { this.algorithm.optimizer.setMiniBatchFraction(miniBatchFraction) this } - /** Set the initial weights. */ + /** + * Set the initial weights. + */ + @Since("1.1.0") def setInitialWeights(initialWeights: Vector): this.type = { this.model = Some(algorithm.createModel(initialWeights, 0.0)) this } - /** Set the convergence tolerance. */ + /** + * Set the convergence tolerance. Default: 0.001. 
+ */ + @Since("1.5.0") def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance) this } - } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala index 93a6753efd4d..4a856f7f3434 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD @@ -37,8 +37,8 @@ import org.apache.spark.rdd.RDD * .setBandwidth(3.0) * val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) * }}} - * @since 1.4.0 */ +@Since("1.4.0") @Experimental class KernelDensity extends Serializable { @@ -52,8 +52,8 @@ class KernelDensity extends Serializable { /** * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`). - * @since 1.4.0 */ + @Since("1.4.0") def setBandwidth(bandwidth: Double): this.type = { require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.") this.bandwidth = bandwidth @@ -62,8 +62,8 @@ class KernelDensity extends Serializable { /** * Sets the sample to use for density estimation. - * @since 1.4.0 */ + @Since("1.4.0") def setSample(sample: RDD[Double]): this.type = { this.sample = sample this @@ -71,8 +71,8 @@ class KernelDensity extends Serializable { /** * Sets the sample to use for density estimation (for Java users). - * @since 1.4.0 */ + @Since("1.4.0") def setSample(sample: JavaRDD[java.lang.Double]): this.type = { this.sample = sample.rdd.asInstanceOf[RDD[Double]] this @@ -80,8 +80,8 @@ class KernelDensity extends Serializable { /** * Estimates probability density function at the given array of points. - * @since 1.4.0 */ + @Since("1.4.0") def estimate(points: Array[Double]): Array[Double] = { val sample = this.sample val bandwidth = this.bandwidth diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 62da9f2ef22a..51b713e263e0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vectors, Vector} /** @@ -33,8 +33,8 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector} * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]] * Zero elements (including explicit zero values) are skipped when calling add(), * to have time complexity O(nnz) instead of O(n) for each column. - * @since 1.1.0 */ +@Since("1.1.0") @DeveloperApi class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable { @@ -53,8 +53,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S * * @param sample The sample in dense/sparse vector format to be added into this summarizer. * @return This MultivariateOnlineSummarizer object. 
- * @since 1.1.0 */ + @Since("1.1.0") def add(sample: Vector): this.type = { if (n == 0) { require(sample.size > 0, s"Vector should have dimension larger than zero.") @@ -109,8 +109,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S * * @param other The other MultivariateOnlineSummarizer to be merged. * @return This MultivariateOnlineSummarizer object. - * @since 1.1.0 */ + @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { if (this.totalCnt != 0 && other.totalCnt != 0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + @@ -153,8 +153,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.1.0 + * Sample mean of each dimension. + * */ + @Since("1.1.0") override def mean: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -168,8 +170,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.1.0 + * Sample variance of each dimension. + * */ + @Since("1.1.0") override def variance: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -193,13 +197,17 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.1.0 + * Sample size. + * */ + @Since("1.1.0") override def count: Long = totalCnt /** - * @since 1.1.0 + * Number of nonzero elements in each dimension. + * */ + @Since("1.1.0") override def numNonzeros: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -207,8 +215,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.1.0 + * Maximum value of each dimension. + * */ + @Since("1.1.0") override def max: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -221,8 +231,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.1.0 + * Minimum value of each dimension. + * */ + @Since("1.1.0") override def min: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -235,8 +247,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.2.0 + * L2 (Euclidean) norm of each dimension. + * */ + @Since("1.2.0") override def normL2: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -252,8 +266,10 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** - * @since 1.2.0 + * L1 norm of each dimension. + * */ + @Since("1.2.0") override def normL1: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala index 3bb49f12289e..39a16fb743d6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala @@ -17,59 +17,60 @@ package org.apache.spark.mllib.stat +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector /** * Trait for multivariate statistical summary of a data matrix. - * @since 1.0.0 */ +@Since("1.0.0") trait MultivariateStatisticalSummary { /** * Sample mean vector.
- * @since 1.0.0 */ + @Since("1.0.0") def mean: Vector /** * Sample variance vector. Should return a zero vector if the sample size is 1. - * @since 1.0.0 */ + @Since("1.0.0") def variance: Vector /** * Sample size. - * @since 1.0.0 */ + @Since("1.0.0") def count: Long /** * Number of nonzero elements (including explicitly presented zero values) in each column. - * @since 1.0.0 */ + @Since("1.0.0") def numNonzeros: Vector /** * Maximum value of each column. - * @since 1.0.0 */ + @Since("1.0.0") def max: Vector /** * Minimum value of each column. - * @since 1.0.0 */ + @Since("1.0.0") def min: Vector /** * Euclidean magnitude of each column - * @since 1.2.0 */ + @Since("1.2.0") def normL2: Vector /** * L1 norm of each column - * @since 1.2.0 */ + @Since("1.2.0") def normL1: Vector } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index f84502919e38..84d64a5bfb38 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -19,8 +19,8 @@ package org.apache.spark.mllib.stat import scala.annotation.varargs -import org.apache.spark.annotation.Experimental -import org.apache.spark.api.java.JavaRDD +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.api.java.{JavaRDD, JavaDoubleRDD} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, Vector} import org.apache.spark.mllib.regression.LabeledPoint @@ -32,8 +32,8 @@ import org.apache.spark.rdd.RDD /** * :: Experimental :: * API for statistical functions in MLlib. - * @since 1.1.0 */ +@Since("1.1.0") @Experimental object Statistics { @@ -42,8 +42,8 @@ object Statistics { * * @param X an RDD[Vector] for which column-wise summary statistics are to be computed. * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics. - * @since 1.1.0 */ + @Since("1.1.0") def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = { new RowMatrix(X).computeColumnSummaryStatistics() } @@ -54,8 +54,8 @@ object Statistics { * * @param X an RDD[Vector] for which the correlation matrix is to be computed. * @return Pearson correlation matrix comparing columns in X. - * @since 1.1.0 */ + @Since("1.1.0") def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X) /** @@ -71,8 +71,8 @@ object Statistics { * @param method String specifying the method to use for computing correlation. * Supported: `pearson` (default), `spearman` * @return Correlation matrix comparing columns in X. - * @since 1.1.0 */ + @Since("1.1.0") def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method) /** @@ -85,14 +85,14 @@ object Statistics { * @param x RDD[Double] of the same cardinality as y. * @param y RDD[Double] of the same cardinality as x. * @return A Double containing the Pearson correlation between the two input RDD[Double]s - * @since 1.1.0 */ + @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y) /** * Java-friendly version of [[corr()]] - * @since 1.4.1 */ + @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double = corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]]) @@ -109,14 +109,14 @@ object Statistics { * Supported: `pearson` (default), `spearman` * @return A Double containing the correlation between the two input RDD[Double]s using the * specified method. 
- * @since 1.1.0 */ + @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method) /** * Java-friendly version of [[corr()]] - * @since 1.4.1 */ + @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double = corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method) @@ -133,8 +133,8 @@ object Statistics { * `expected` is rescaled if the `expected` sum differs from the `observed` sum. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = { ChiSqTest.chiSquared(observed, expected) } @@ -148,8 +148,8 @@ object Statistics { * @param observed Vector containing the observed categorical counts/relative frequencies. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed) /** @@ -159,8 +159,8 @@ object Statistics { * @param observed The contingency matrix (containing either counts or relative frequencies). * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed) /** @@ -172,12 +172,16 @@ object Statistics { * Real-valued features will be treated as categorical for each distinct value. * @return an array containing the ChiSquaredTestResult for every feature against the label. * The order of the elements in the returned array reflects the order of input features. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = { ChiSqTest.chiSquaredFeatures(data) } + /** Java-friendly version of [[chiSqTest()]] */ + @Since("1.5.0") + def chiSqTest(data: JavaRDD[LabeledPoint]): Array[ChiSqTestResult] = chiSqTest(data.rdd) + /** * Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a * continuous distribution. By comparing the largest difference between the empirical cumulative @@ -191,6 +195,7 @@ object Statistics { * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test * statistic, p-value, and null hypothesis. */ + @Since("1.5.0") def kolmogorovSmirnovTest(data: RDD[Double], cdf: Double => Double) : KolmogorovSmirnovTestResult = { KolmogorovSmirnovTest.testOneSample(data, cdf) @@ -207,9 +212,20 @@ object Statistics { * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test * statistic, p-value, and null hypothesis. 
*/ + @Since("1.5.0") @varargs def kolmogorovSmirnovTest(data: RDD[Double], distName: String, params: Double*) : KolmogorovSmirnovTestResult = { KolmogorovSmirnovTest.testOneSample(data, distName, params: _*) } + + /** Java-friendly version of [[kolmogorovSmirnovTest()]] */ + @Since("1.5.0") + @varargs + def kolmogorovSmirnovTest( + data: JavaDoubleRDD, + distName: String, + params: Double*): KolmogorovSmirnovTestResult = { + kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, params: _*) + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index 9aa7763d7890..92a5af708d04 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat.distribution import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym, Vector => BV} -import org.apache.spark.annotation.DeveloperApi; +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix} import org.apache.spark.mllib.util.MLUtils @@ -32,12 +32,12 @@ import org.apache.spark.mllib.util.MLUtils * * @param mu The mean vector of the distribution * @param sigma The covariance matrix of the distribution - * @since 1.3.0 */ +@Since("1.3.0") @DeveloperApi -class MultivariateGaussian ( - val mu: Vector, - val sigma: Matrix) extends Serializable { +class MultivariateGaussian @Since("1.3.0") ( + @Since("1.3.0") val mu: Vector, + @Since("1.3.0") val sigma: Matrix) extends Serializable { require(sigma.numCols == sigma.numRows, "Covariance matrix must be square") require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size") @@ -62,15 +62,15 @@ class MultivariateGaussian ( private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants /** Returns density of this multivariate Gaussian at given point, x - * @since 1.3.0 */ + @Since("1.3.0") def pdf(x: Vector): Double = { pdf(x.toBreeze) } /** Returns the log-density of this multivariate Gaussian at given point, x - * @since 1.3.0 */ + @Since("1.3.0") def logpdf(x: Vector): Double = { logpdf(x.toBreeze) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala index f44be1370669..d01b3707be94 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat.test -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: @@ -25,28 +25,33 @@ import org.apache.spark.annotation.Experimental * @tparam DF Return type of `degreesOfFreedom`. */ @Experimental +@Since("1.1.0") trait TestResult[DF] { /** * The probability of obtaining a test statistic result at least as extreme as the one that was * actually observed, assuming that the null hypothesis is true. */ + @Since("1.1.0") def pValue: Double /** * Returns the degree(s) of freedom of the hypothesis test. * Return type should be Number(e.g. Int, Double) or tuples of Numbers for toString compatibility. */ + @Since("1.1.0") def degreesOfFreedom: DF /** * Test statistic. 
*/ + @Since("1.1.0") def statistic: Double /** * Null hypothesis of the test. */ + @Since("1.1.0") def nullHypothesis: String /** @@ -78,11 +83,12 @@ trait TestResult[DF] { * Object containing the test results for the chi-squared hypothesis test. */ @Experimental +@Since("1.1.0") class ChiSqTestResult private[stat] (override val pValue: Double, - override val degreesOfFreedom: Int, - override val statistic: Double, - val method: String, - override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.1.0") override val degreesOfFreedom: Int, + @Since("1.1.0") override val statistic: Double, + @Since("1.1.0") val method: String, + @Since("1.1.0") override val nullHypothesis: String) extends TestResult[Int] { override def toString: String = { "Chi squared test summary:\n" + @@ -96,11 +102,13 @@ class ChiSqTestResult private[stat] (override val pValue: Double, * Object containing the test results for the Kolmogorov-Smirnov test. */ @Experimental +@Since("1.5.0") class KolmogorovSmirnovTestResult private[stat] ( - override val pValue: Double, - override val statistic: Double, - override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.5.0") override val pValue: Double, + @Since("1.5.0") override val statistic: Double, + @Since("1.5.0") override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.5.0") override val degreesOfFreedom = 0 override def toString: String = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index cecd1fed896d..4a77d4adcd86 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuilder import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.RandomForest.NodeIndexInfo @@ -44,8 +44,10 @@ import org.apache.spark.util.random.XORShiftRandom * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. */ +@Since("1.0.0") @Experimental -class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { +class DecisionTree @Since("1.0.0") (private val strategy: Strategy) + extends Serializable with Logging { strategy.assertValid() @@ -54,6 +56,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] * @return DecisionTreeModel that can be used for prediction */ + @Since("1.2.0") def run(input: RDD[LabeledPoint]): DecisionTreeModel = { // Note: random seed will not be used since numTrees = 1. val rf = new RandomForest(strategy, numTrees = 1, featureSubsetStrategy = "all", seed = 0) @@ -62,6 +65,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo } } +@Since("1.0.0") object DecisionTree extends Serializable with Logging { /** @@ -79,7 +83,8 @@ object DecisionTree extends Serializable with Logging { * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. 
* @return DecisionTreeModel that can be used for prediction - */ + */ + @Since("1.0.0") def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = { new DecisionTree(strategy).run(input) } @@ -101,6 +106,7 @@ object DecisionTree extends Serializable with Logging { * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * @return DecisionTreeModel that can be used for prediction */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -128,6 +134,7 @@ object DecisionTree extends Serializable with Logging { * @param numClasses number of classes for classification. Default value of 2. * @return DecisionTreeModel that can be used for prediction */ + @Since("1.2.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -161,6 +168,7 @@ object DecisionTree extends Serializable with Logging { * with k categories indexed from 0: {0, 1, ..., k-1}. * @return DecisionTreeModel that can be used for prediction */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -193,6 +201,7 @@ object DecisionTree extends Serializable with Logging { * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction */ + @Since("1.1.0") def trainClassifier( input: RDD[LabeledPoint], numClasses: Int, @@ -208,6 +217,7 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] */ + @Since("1.1.0") def trainClassifier( input: JavaRDD[LabeledPoint], numClasses: Int, @@ -237,6 +247,7 @@ object DecisionTree extends Serializable with Logging { * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction */ + @Since("1.1.0") def trainRegressor( input: RDD[LabeledPoint], categoricalFeaturesInfo: Map[Int, Int], @@ -250,6 +261,7 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] */ + @Since("1.1.0") def trainRegressor( input: JavaRDD[LabeledPoint], categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index 9ce6faa137c4..95ed48cea671 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.tree import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.impl.PeriodicRDDCheckpointer import org.apache.spark.mllib.regression.LabeledPoint @@ -49,8 +49,9 @@ import org.apache.spark.storage.StorageLevel * * @param boostingStrategy Parameters for the gradient boosting algorithm. */ +@Since("1.2.0") @Experimental -class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) +class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: BoostingStrategy) extends Serializable with Logging { /** @@ -58,6 +59,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. 
* @return a gradient boosted trees model that can be used for prediction */ + @Since("1.2.0") def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = { val algo = boostingStrategy.treeStrategy.algo algo match { @@ -75,6 +77,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]]. */ + @Since("1.2.0") def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { run(input.rdd) } @@ -89,6 +92,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * by using [[org.apache.spark.rdd.RDD.randomSplit()]] * @return a gradient boosted trees model that can be used for prediction */ + @Since("1.4.0") def runWithValidation( input: RDD[LabeledPoint], validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -112,6 +116,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]]. */ + @Since("1.4.0") def runWithValidation( input: JavaRDD[LabeledPoint], validationInput: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -119,6 +124,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) } } +@Since("1.2.0") object GradientBoostedTrees extends Logging { /** @@ -130,6 +136,7 @@ object GradientBoostedTrees extends Logging { * @param boostingStrategy Configuration options for the boosting algorithm. * @return a gradient boosted trees model that can be used for prediction */ + @Since("1.2.0") def train( input: RDD[LabeledPoint], boostingStrategy: BoostingStrategy): GradientBoostedTreesModel = { @@ -139,6 +146,7 @@ object GradientBoostedTrees extends Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees$#train]] */ + @Since("1.2.0") def train( input: JavaRDD[LabeledPoint], boostingStrategy: BoostingStrategy): GradientBoostedTreesModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 069959976a18..63a902f3eb51 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import scala.collection.JavaConverters._ import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.Strategy @@ -260,6 +260,7 @@ private class RandomForest ( } +@Since("1.2.0") object RandomForest extends Serializable with Logging { /** @@ -277,6 +278,7 @@ object RandomForest extends Serializable with Logging { * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ + @Since("1.2.0") def trainClassifier( input: RDD[LabeledPoint], strategy: Strategy, @@ -314,6 +316,7 @@ object RandomForest extends Serializable with Logging { * @param seed Random seed for bootstrapping and choosing feature subsets. 
* @return a random forest model that can be used for prediction */ + @Since("1.2.0") def trainClassifier( input: RDD[LabeledPoint], numClasses: Int, @@ -333,6 +336,7 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainClassifier]] */ + @Since("1.2.0") def trainClassifier( input: JavaRDD[LabeledPoint], numClasses: Int, @@ -363,6 +367,7 @@ object RandomForest extends Serializable with Logging { * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ + @Since("1.2.0") def trainRegressor( input: RDD[LabeledPoint], strategy: Strategy, @@ -399,6 +404,7 @@ object RandomForest extends Serializable with Logging { * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ + @Since("1.2.0") def trainRegressor( input: RDD[LabeledPoint], categoricalFeaturesInfo: Map[Int, Int], @@ -417,6 +423,7 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainRegressor]] */ + @Since("1.2.0") def trainRegressor( input: JavaRDD[LabeledPoint], categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer], @@ -434,6 +441,7 @@ object RandomForest extends Serializable with Logging { /** * List of supported feature subset sampling strategies. */ + @Since("1.2.0") val supportedFeatureSubsetStrategies: Array[String] = Array("auto", "all", "sqrt", "log2", "onethird") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index b6099259971b..853c7319ec44 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -17,15 +17,18 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum to select the algorithm for the decision tree */ +@Since("1.0.0") @Experimental object Algo extends Enumeration { + @Since("1.0.0") type Algo = Value + @Since("1.0.0") val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 9fd30c9b5631..b5c72fba3ede 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} @@ -39,15 +39,16 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} * then stop. Ignored when * [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used. 
*/ +@Since("1.2.0") @Experimental -case class BoostingStrategy( +case class BoostingStrategy @Since("1.4.0") ( // Required boosting parameters - @BeanProperty var treeStrategy: Strategy, - @BeanProperty var loss: Loss, + @Since("1.2.0") @BeanProperty var treeStrategy: Strategy, + @Since("1.2.0") @BeanProperty var loss: Loss, // Optional boosting parameters - @BeanProperty var numIterations: Int = 100, - @BeanProperty var learningRate: Double = 0.1, - @BeanProperty var validationTol: Double = 1e-5) extends Serializable { + @Since("1.2.0") @BeanProperty var numIterations: Int = 100, + @Since("1.2.0") @BeanProperty var learningRate: Double = 0.1, + @Since("1.4.0") @BeanProperty var validationTol: Double = 1e-5) extends Serializable { /** * Check validity of parameters. @@ -70,6 +71,7 @@ case class BoostingStrategy( } } +@Since("1.2.0") @Experimental object BoostingStrategy { @@ -78,6 +80,7 @@ object BoostingStrategy { * @param algo Learning goal. Supported: "Classification" or "Regression" * @return Configuration for boosting algorithm */ + @Since("1.2.0") def defaultParams(algo: String): BoostingStrategy = { defaultParams(Algo.fromString(algo)) } @@ -89,8 +92,9 @@ object BoostingStrategy { * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]] * @return Configuration for boosting algorithm */ + @Since("1.3.0") def defaultParams(algo: Algo): BoostingStrategy = { - val treeStrategy = Strategy.defaultStategy(algo) + val treeStrategy = Strategy.defaultStrategy(algo) treeStrategy.maxDepth = 3 algo match { case Algo.Classification => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala index f4c877232750..4e0cd473def0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala @@ -17,14 +17,17 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum to describe whether a feature is "continuous" or "categorical" */ +@Since("1.0.0") @Experimental object FeatureType extends Enumeration { + @Since("1.0.0") type FeatureType = Value + @Since("1.0.0") val Continuous, Categorical = Value } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala index 7da976e55a72..8262db8a4f11 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala @@ -17,14 +17,17 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum for selecting the quantile calculation strategy */ +@Since("1.0.0") @Experimental object QuantileStrategy extends Enumeration { + @Since("1.0.0") type QuantileStrategy = Value + @Since("1.0.0") val Sort, MinMax, ApproxHist = Value } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index ada227c200a7..89cc13b7c06c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty import scala.collection.JavaConverters._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.tree.impurity.{Variance, Entropy, Gini, Impurity} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ @@ -67,26 +67,33 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ * the checkpoint directory is not set in * [[org.apache.spark.SparkContext]], this setting is ignored. */ +@Since("1.0.0") @Experimental -class Strategy ( - @BeanProperty var algo: Algo, - @BeanProperty var impurity: Impurity, - @BeanProperty var maxDepth: Int, - @BeanProperty var numClasses: Int = 2, - @BeanProperty var maxBins: Int = 32, - @BeanProperty var quantileCalculationStrategy: QuantileStrategy = Sort, - @BeanProperty var categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](), - @BeanProperty var minInstancesPerNode: Int = 1, - @BeanProperty var minInfoGain: Double = 0.0, - @BeanProperty var maxMemoryInMB: Int = 256, - @BeanProperty var subsamplingRate: Double = 1, - @BeanProperty var useNodeIdCache: Boolean = false, - @BeanProperty var checkpointInterval: Int = 10) extends Serializable { +class Strategy @Since("1.3.0") ( + @Since("1.0.0") @BeanProperty var algo: Algo, + @Since("1.0.0") @BeanProperty var impurity: Impurity, + @Since("1.0.0") @BeanProperty var maxDepth: Int, + @Since("1.2.0") @BeanProperty var numClasses: Int = 2, + @Since("1.0.0") @BeanProperty var maxBins: Int = 32, + @Since("1.0.0") @BeanProperty var quantileCalculationStrategy: QuantileStrategy = Sort, + @Since("1.0.0") @BeanProperty var categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](), + @Since("1.2.0") @BeanProperty var minInstancesPerNode: Int = 1, + @Since("1.2.0") @BeanProperty var minInfoGain: Double = 0.0, + @Since("1.0.0") @BeanProperty var maxMemoryInMB: Int = 256, + @Since("1.2.0") @BeanProperty var subsamplingRate: Double = 1, + @Since("1.2.0") @BeanProperty var useNodeIdCache: Boolean = false, + @Since("1.2.0") @BeanProperty var checkpointInterval: Int = 10) extends Serializable { + /** + */ + @Since("1.2.0") def isMulticlassClassification: Boolean = { algo == Classification && numClasses > 2 } + /** + */ + @Since("1.2.0") def isMulticlassWithCategoricalFeatures: Boolean = { isMulticlassClassification && (categoricalFeaturesInfo.size > 0) } @@ -94,6 +101,7 @@ class Strategy ( /** * Java-friendly constructor for [[org.apache.spark.mllib.tree.configuration.Strategy]] */ + @Since("1.1.0") def this( algo: Algo, impurity: Impurity, @@ -108,6 +116,7 @@ class Strategy ( /** * Sets Algorithm using a String. */ + @Since("1.2.0") def setAlgo(algo: String): Unit = algo match { case "Classification" => setAlgo(Classification) case "Regression" => setAlgo(Regression) @@ -116,6 +125,7 @@ class Strategy ( /** * Sets categoricalFeaturesInfo using a Java Map. */ + @Since("1.2.0") def setCategoricalFeaturesInfo( categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer]): Unit = { this.categoricalFeaturesInfo = @@ -148,11 +158,6 @@ class Strategy ( s" Valid values are integers >= 0.") require(maxBins >= 2, s"DecisionTree Strategy given invalid maxBins parameter: $maxBins." 
+ s" Valid values are integers >= 2.") - categoricalFeaturesInfo.foreach { case (feature, arity) => - require(arity >= 2, - s"DecisionTree Strategy given invalid categoricalFeaturesInfo setting:" + - s" feature $feature has $arity categories. The number of categories should be >= 2.") - } require(minInstancesPerNode >= 1, s"DecisionTree Strategy requires minInstancesPerNode >= 1 but was given $minInstancesPerNode") require(maxMemoryInMB <= 10240, @@ -162,7 +167,10 @@ class Strategy ( s"$subsamplingRate") } - /** Returns a shallow copy of this instance. */ + /** + * Returns a shallow copy of this instance. + */ + @Since("1.2.0") def copy: Strategy = { new Strategy(algo, impurity, maxDepth, numClasses, maxBins, quantileCalculationStrategy, categoricalFeaturesInfo, minInstancesPerNode, minInfoGain, @@ -170,6 +178,7 @@ class Strategy ( } } +@Since("1.2.0") @Experimental object Strategy { @@ -177,15 +186,17 @@ object Strategy { * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo "Classification" or "Regression" */ + @Since("1.2.0") def defaultStrategy(algo: String): Strategy = { - defaultStategy(Algo.fromString(algo)) + defaultStrategy(Algo.fromString(algo)) } /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo Algo.Classification or Algo.Regression */ - def defaultStategy(algo: Algo): Strategy = algo match { + @Since("1.3.0") + def defaultStrategy(algo: Algo): Strategy = algo match { case Algo.Classification => new Strategy(algo = Classification, impurity = Gini, maxDepth = 10, numClasses = 2) @@ -193,4 +204,9 @@ object Strategy { new Strategy(algo = Regression, impurity = Variance, maxDepth = 10, numClasses = 0) } + + @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0") + @Since("1.2.0") + def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo) + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala index 9fe264656ede..21ee49c45788 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala @@ -144,21 +144,28 @@ private[spark] object DecisionTreeMetadata extends Logging { val maxCategoriesForUnorderedFeature = ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => - // Decide if some categorical features should be treated as unordered features, - // which require 2 * ((1 << numCategories - 1) - 1) bins. - // We do this check with log values to prevent overflows in case numCategories is large. - // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins - if (numCategories <= maxCategoriesForUnorderedFeature) { - unorderedFeatures.add(featureIndex) - numBins(featureIndex) = numUnorderedBins(numCategories) - } else { - numBins(featureIndex) = numCategories + // Hack: If a categorical feature has only 1 category, we treat it as continuous. + // TODO(SPARK-9957): Handle this properly by filtering out those features. + if (numCategories > 1) { + // Decide if some categorical features should be treated as unordered features, + // which require 2 * ((1 << numCategories - 1) - 1) bins. + // We do this check with log values to prevent overflows in case numCategories is large. 
+ // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins + if (numCategories <= maxCategoriesForUnorderedFeature) { + unorderedFeatures.add(featureIndex) + numBins(featureIndex) = numUnorderedBins(numCategories) + } else { + numBins(featureIndex) = numCategories + } } } } else { // Binary classification or regression strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => - numBins(featureIndex) = numCategories + // If a categorical feature has only 1 category, we treat it as continuous: SPARK-9957 + if (numCategories > 1) { + numBins(featureIndex) = numCategories + } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala index 0768204c3391..73df6b054a8c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala @@ -17,13 +17,14 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: * Class for calculating [[http://en.wikipedia.org/wiki/Binary_entropy_function entropy]] during * binary classification. */ +@Since("1.0.0") @Experimental object Entropy extends Impurity { @@ -36,6 +37,7 @@ object Entropy extends Impurity { * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { if (totalCount == 0) { @@ -63,6 +65,7 @@ object Entropy extends Impurity { * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = throw new UnsupportedOperationException("Entropy.calculate") @@ -71,6 +74,7 @@ object Entropy extends Impurity { * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. */ + @Since("1.1.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala index d0077db6832e..f21845b21a80 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: @@ -25,6 +25,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * [[http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity Gini impurity]] * during binary classification. 
*/ +@Since("1.0.0") @Experimental object Gini extends Impurity { @@ -35,6 +36,7 @@ object Gini extends Impurity { * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { if (totalCount == 0) { @@ -59,6 +61,7 @@ object Gini extends Impurity { * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = throw new UnsupportedOperationException("Gini.calculate") @@ -67,6 +70,7 @@ object Gini extends Impurity { * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. */ + @Since("1.1.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala index 86cee7e430b0..4637dcceea7f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: @@ -26,6 +26,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * (a) setting the impurity parameter in [[org.apache.spark.mllib.tree.configuration.Strategy]] * (b) calculating impurity values from sufficient statistics. */ +@Since("1.0.0") @Experimental trait Impurity extends Serializable { @@ -36,6 +37,7 @@ trait Impurity extends Serializable { * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 */ + @Since("1.1.0") @DeveloperApi def calculate(counts: Array[Double], totalCount: Double): Double @@ -47,6 +49,7 @@ trait Impurity extends Serializable { * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 */ + @Since("1.0.0") @DeveloperApi def calculate(count: Double, sum: Double, sumSquares: Double): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala index 04d0cd24e663..a74197278d6f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala @@ -17,12 +17,13 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: * Class for calculating variance during regression */ +@Since("1.0.0") @Experimental object Variance extends Impurity { @@ -33,6 +34,7 @@ object Variance extends Impurity { * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = throw new UnsupportedOperationException("Variance.calculate") @@ -45,6 +47,7 @@ object Variance extends Impurity { * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: 
Double, sum: Double, sumSquares: Double): Double = { if (count == 0) { @@ -58,6 +61,7 @@ object Variance extends Impurity { * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. */ + @Since("1.0.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala index 2bdef73c4a8f..bab7b8c6cadf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel @@ -30,6 +30,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * |y - F(x)| * where y is the label and F(x) is the model prediction for features x. */ +@Since("1.2.0") @DeveloperApi object AbsoluteError extends Loss { @@ -41,6 +42,7 @@ object AbsoluteError extends Loss { * @param label True label. * @return Loss gradient */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { if (label - prediction < 0) 1.0 else -1.0 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala index 778c24526de7..b2b4594712f0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.mllib.util.MLUtils @@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils * 2 log(1 + exp(-2 y F(x))) * where y is a label in {-1, 1} and F(x) is the model prediction for features x. */ +@Since("1.2.0") @DeveloperApi object LogLoss extends Loss { @@ -43,6 +44,7 @@ object LogLoss extends Loss { * @param label True label. * @return Loss gradient */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala index 64ffccbce073..687cde325ffe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.rdd.RDD @@ -27,6 +27,7 @@ import org.apache.spark.rdd.RDD * :: DeveloperApi :: * Trait for adding "pluggable" loss functions for the gradient boosting algorithm. */ +@Since("1.2.0") @DeveloperApi trait Loss extends Serializable { @@ -36,6 +37,7 @@ trait Loss extends Serializable { * @param label true label. * @return Loss gradient. 
*/ + @Since("1.2.0") def gradient(prediction: Double, label: Double): Double /** @@ -46,6 +48,7 @@ trait Loss extends Serializable { * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * @return Measure of model error on data */ + @Since("1.2.0") def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = { data.map(point => computeError(model.predict(point.features), point.label)).mean() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala index 42c9ead9884b..2b112fbe1220 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala @@ -17,8 +17,12 @@ package org.apache.spark.mllib.tree.loss +import org.apache.spark.annotation.Since + +@Since("1.2.0") object Losses { + @Since("1.2.0") def fromString(name: String): Loss = name match { case "leastSquaresError" => SquaredError case "leastAbsoluteError" => AbsoluteError diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala index 011a5d57422f..3f7d3d38be16 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel @@ -30,6 +30,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * (y - F(x))**2 * where y is the label and F(x) is the model prediction for features x. */ +@Since("1.2.0") @DeveloperApi object SquaredError extends Loss { @@ -41,6 +42,7 @@ object SquaredError extends Loss { * @param label True label. * @return Loss gradient */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 2.0 * (label - prediction) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index f2c78bbabff0..e1bf23f4c34b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType} @@ -41,8 +41,11 @@ import org.apache.spark.util.Utils * @param topNode root node * @param algo algorithm type -- classification or regression */ +@Since("1.0.0") @Experimental -class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable { +class DecisionTreeModel @Since("1.0.0") ( + @Since("1.0.0") val topNode: Node, + @Since("1.0.0") val algo: Algo) extends Serializable with Saveable { /** * Predict values for a single data point using the model trained. 
@@ -50,6 +53,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * @param features array representing a single data point * @return Double prediction from the trained model */ + @Since("1.0.0") def predict(features: Vector): Double = { topNode.predict(features) } @@ -60,6 +64,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * @param features RDD representing data points to be predicted * @return RDD of predictions for each of the given data points */ + @Since("1.0.0") def predict(features: RDD[Vector]): RDD[Double] = { features.map(x => predict(x)) } @@ -70,6 +75,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * @param features JavaRDD representing data points to be predicted * @return JavaRDD of predictions for each of the given data points */ + @Since("1.2.0") def predict(features: JavaRDD[Vector]): JavaRDD[Double] = { predict(features.rdd) } @@ -77,6 +83,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Get number of nodes in tree, including leaf nodes. */ + @Since("1.1.0") def numNodes: Int = { 1 + topNode.numDescendants } @@ -85,6 +92,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * Get depth of tree. * E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes. */ + @Since("1.1.0") def depth: Int = { topNode.subtreeDepth } @@ -104,11 +112,18 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Print the full model to a string. */ + @Since("1.2.0") def toDebugString: String = { val header = toString + "\n" header + topNode.subtreeToString(2) } + /** + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. + * If the directory already exists, this method throws an exception. + */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { DecisionTreeModel.SaveLoadV1_0.save(sc, path, this) } @@ -116,6 +131,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable override protected def formatVersion: String = DecisionTreeModel.formatVersion } +@Since("1.3.0") object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { private[spark] def formatVersion: String = "1.0" @@ -297,6 +313,13 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { } } + /** + * + * @param sc Spark context used for loading model files. + * @param path Path specifying the directory to which the model was saved. 
+ * @return Model instance + */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): DecisionTreeModel = { implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala index 508bf9c1bdb4..091a0462c204 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.tree.impurity.ImpurityCalculator /** @@ -30,6 +30,7 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator * @param leftPredict left node predict * @param rightPredict right node predict */ +@Since("1.0.0") @DeveloperApi class InformationGainStats( val gain: Double, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala index a6d1398fc267..ea6e5aa5d94e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.Logging import org.apache.spark.mllib.tree.configuration.FeatureType._ import org.apache.spark.mllib.linalg.Vector @@ -39,16 +39,17 @@ import org.apache.spark.mllib.linalg.Vector * @param rightNode right child * @param stats information gain stats */ +@Since("1.0.0") @DeveloperApi -class Node ( - val id: Int, - var predict: Predict, - var impurity: Double, - var isLeaf: Boolean, - var split: Option[Split], - var leftNode: Option[Node], - var rightNode: Option[Node], - var stats: Option[InformationGainStats]) extends Serializable with Logging { +class Node @Since("1.2.0") ( + @Since("1.0.0") val id: Int, + @Since("1.0.0") var predict: Predict, + @Since("1.2.0") var impurity: Double, + @Since("1.0.0") var isLeaf: Boolean, + @Since("1.0.0") var split: Option[Split], + @Since("1.0.0") var leftNode: Option[Node], + @Since("1.0.0") var rightNode: Option[Node], + @Since("1.0.0") var stats: Option[InformationGainStats]) extends Serializable with Logging { override def toString: String = { s"id = $id, isLeaf = $isLeaf, predict = $predict, impurity = $impurity, " + @@ -59,6 +60,7 @@ class Node ( * build the left node and right nodes if not leaf * @param nodes array of nodes */ + @Since("1.0.0") @deprecated("build should no longer be used since trees are constructed on-the-fly in training", "1.2.0") def build(nodes: Array[Node]): Unit = { @@ -80,6 +82,7 @@ class Node ( * @param features feature value * @return predicted value */ + @Since("1.1.0") def predict(features: Vector) : Double = { if (isLeaf) { predict.predict diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala index 5cbe7c280dbe..06ceff19d863 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala @@ -17,17 +17,18 @@ package 
org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} /** * Predicted value for a node * @param predict predicted value * @param prob probability of the label (classification only) */ +@Since("1.2.0") @DeveloperApi -class Predict( - val predict: Double, - val prob: Double = 0.0) extends Serializable { +class Predict @Since("1.2.0") ( + @Since("1.2.0") val predict: Double, + @Since("1.2.0") val prob: Double = 0.0) extends Serializable { override def toString: String = s"$predict (prob = $prob)" diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala index be6c9b3de547..b85a66c05a81 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType import org.apache.spark.mllib.tree.configuration.FeatureType import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType @@ -31,12 +31,13 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType * @param featureType type of feature -- categorical or continuous * @param categories Split left if categorical feature value is in this set, else right. */ +@Since("1.0.0") @DeveloperApi case class Split( - feature: Int, - threshold: Double, - featureType: FeatureType, - categories: List[Double]) { + @Since("1.0.0") feature: Int, + @Since("1.0.0") threshold: Double, + @Since("1.0.0") featureType: FeatureType, + @Since("1.0.0") categories: List[Double]) { override def toString: String = { s"Feature = $feature, threshold = $threshold, featureType = $featureType, " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 905c5fb42bd4..df5b8feab5d5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint @@ -46,14 +46,24 @@ import org.apache.spark.util.Utils * @param algo algorithm for the ensemble model, either Classification or Regression * @param trees tree ensembles */ +@Since("1.2.0") @Experimental -class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel]) +class RandomForestModel @Since("1.2.0") ( + @Since("1.2.0") override val algo: Algo, + @Since("1.2.0") override val trees: Array[DecisionTreeModel]) extends TreeEnsembleModel(algo, trees, Array.fill(trees.length)(1.0), combiningStrategy = if (algo == Classification) Vote else Average) with Saveable { require(trees.forall(_.algo == algo)) + /** + * + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. 
+ * If the directory already exists, this method throws an exception. + */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, RandomForestModel.SaveLoadV1_0.thisClassName) @@ -62,10 +72,18 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis override protected def formatVersion: String = RandomForestModel.formatVersion } +@Since("1.3.0") object RandomForestModel extends Loader[RandomForestModel] { private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion + /** + * + * @param sc Spark context used for loading model files. + * @param path Path specifying the directory to which the model was saved. + * @return Model instance + */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): RandomForestModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -97,16 +115,23 @@ object RandomForestModel extends Loader[RandomForestModel] { * @param trees tree ensembles * @param treeWeights tree ensemble weights */ +@Since("1.2.0") @Experimental -class GradientBoostedTreesModel( - override val algo: Algo, - override val trees: Array[DecisionTreeModel], - override val treeWeights: Array[Double]) +class GradientBoostedTreesModel @Since("1.2.0") ( + @Since("1.2.0") override val algo: Algo, + @Since("1.2.0") override val trees: Array[DecisionTreeModel], + @Since("1.2.0") override val treeWeights: Array[Double]) extends TreeEnsembleModel(algo, trees, treeWeights, combiningStrategy = Sum) with Saveable { require(trees.length == treeWeights.length) + /** + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. + * If the directory already exists, this method throws an exception. + */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, GradientBoostedTreesModel.SaveLoadV1_0.thisClassName) @@ -119,6 +144,7 @@ class GradientBoostedTreesModel( * @return an array with index i having the losses or errors for the ensemble * containing the first i+1 trees */ + @Since("1.4.0") def evaluateEachIteration( data: RDD[LabeledPoint], loss: Loss): Array[Double] = { @@ -159,6 +185,9 @@ class GradientBoostedTreesModel( override protected def formatVersion: String = GradientBoostedTreesModel.formatVersion } +/** + */ +@Since("1.3.0") object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { /** @@ -171,6 +200,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @return a RDD with each element being a zip of the prediction and error * corresponding to every sample. */ + @Since("1.4.0") def computeInitialPredictionAndError( data: RDD[LabeledPoint], initTreeWeight: Double, @@ -194,6 +224,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @return a RDD with each element being a zip of the prediction and error * corresponding to each sample. */ + @Since("1.4.0") def updatePredictionError( data: RDD[LabeledPoint], predictionAndError: RDD[(Double, Double)], @@ -213,6 +244,12 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion + /** + * @param sc Spark context used for loading model files. 
+ * @param path Path specifying the directory to which the model was saved. + * @return Model instance + */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1aca58..dffe6e78939e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. */ @DeveloperApi +@Since("0.8.0") object DataValidators extends Logging { /** @@ -34,6 +35,7 @@ object DataValidators extends Logging { * * @return True if labels are all zero or one, false otherwise. */ + @Since("1.0.0") val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count() if (numInvalid != 0) { @@ -48,6 +50,7 @@ object DataValidators extends Logging { * * @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise. */ + @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala index e6bcff48b022..00fd1606a369 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala @@ -19,8 +19,8 @@ package org.apache.spark.mllib.util import scala.util.Random -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.rdd.RDD /** @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * cluster with scale 1 around each center. 
*/ @DeveloperApi +@Since("0.8.0") object KMeansDataGenerator { /** @@ -42,6 +43,7 @@ object KMeansDataGenerator { * @param r Scaling factor for the distribution of the initial centers * @param numPartitions Number of partitions of the generated RDD; default 2 */ + @Since("0.8.0") def generateKMeansRDD( sc: SparkContext, numPoints: Int, @@ -62,6 +64,7 @@ object KMeansDataGenerator { } } + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 6) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 87eeb5db05d2..8b0a02e0582b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -22,11 +22,11 @@ import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: @@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint * response variable `Y`. */ @DeveloperApi +@Since("0.8.0") object LinearDataGenerator { /** @@ -46,6 +47,7 @@ object LinearDataGenerator { * @param seed Random seed * @return Java List of input. */ + @Since("0.8.0") def generateLinearInputAsList( intercept: Double, weights: Array[Double], @@ -68,6 +70,7 @@ object LinearDataGenerator { * @param eps Epsilon scaling factor. * @return Seq of input. */ + @Since("0.8.0") def generateLinearInput( intercept: Double, weights: Array[Double], @@ -92,6 +95,7 @@ object LinearDataGenerator { * @param eps Epsilon scaling factor. * @return Seq of input. */ + @Since("0.8.0") def generateLinearInput( intercept: Double, weights: Array[Double], @@ -132,6 +136,7 @@ object LinearDataGenerator { * * @return RDD of LabeledPoint containing sample data. */ + @Since("0.8.0") def generateLinearRDD( sc: SparkContext, nexamples: Int, @@ -151,6 +156,7 @@ object LinearDataGenerator { data } + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala index c09cbe69bb97..33477ee20ebb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.util import scala.util.Random -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint @@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors * with probability `probOne` and scales features for positive examples by `eps`. */ @DeveloperApi +@Since("0.8.0") object LogisticRegressionDataGenerator { /** @@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator { * @param nparts Number of partitions of the generated RDD. Default value is 2. * @param probOne Probability that a label is 1 (and not 0). Default value is 0.5. 
*/ + @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, @@ -62,6 +64,7 @@ object LogisticRegressionDataGenerator { data } + @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index 16f430599a51..906bd30563bd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -23,7 +23,7 @@ import scala.language.postfixOps import scala.util.Random import org.apache.spark.SparkContext -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix} import org.apache.spark.rdd.RDD @@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD * testSampFact (Double) Percentage of training data to use as test data. */ @DeveloperApi +@Since("0.8.0") object MFDataGenerator { + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7c5cfa7bd84c..81c2f0ce6e12 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -21,7 +21,7 @@ import scala.reflect.ClassTag import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PartitionwiseSampledRDD @@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream /** * Helper methods to load, save and pre-process data used in ML Lib. */ +@Since("0.8.0") object MLUtils { private[mllib] lazy val EPSILON = { @@ -65,6 +66,7 @@ object MLUtils { * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] */ + @Since("1.0.0") def loadLibSVMFile( sc: SparkContext, path: String, @@ -114,6 +116,7 @@ object MLUtils { // Convenient methods for `loadLibSVMFile`. + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -127,12 +130,14 @@ object MLUtils { * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. */ + @Since("1.0.0") def loadLibSVMFile( sc: SparkContext, path: String, numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -141,6 +146,7 @@ object MLUtils { numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -152,6 +158,7 @@ object MLUtils { * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions. 
*/ + @Since("1.0.0") def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -162,6 +169,7 @@ object MLUtils { * * @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]] */ + @Since("1.0.0") def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) { // TODO: allow to specify label precision and feature precision. val dataStr = data.map { case LabeledPoint(label, features) => @@ -182,12 +190,14 @@ object MLUtils { * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] */ + @Since("1.1.0") def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions. */ + @Since("1.1.0") def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -198,6 +208,7 @@ object MLUtils { * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] */ + @Since("1.1.0") def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) @@ -205,6 +216,7 @@ object MLUtils { * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. */ + @Since("1.1.0") def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -221,6 +233,7 @@ object MLUtils { * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and * [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading. */ + @Since("1.0.0") @deprecated("Should use MLUtils.loadLabeledPoints instead.", "1.0.1") def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = { sc.textFile(dir).map { line => @@ -242,6 +255,7 @@ object MLUtils { * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and * [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading. */ + @Since("1.0.0") @deprecated("Should use RDD[LabeledPoint].saveAsTextFile instead.", "1.0.1") def saveLabeledData(data: RDD[LabeledPoint], dir: String) { val dataStr = data.map(x => x.label + "," + x.features.toArray.mkString(" ")) @@ -254,6 +268,7 @@ object MLUtils { * containing the training data, a complement of the validation data and the second * element, the validation data, containing a unique 1/kth of the data. Where k=numFolds. */ + @Since("1.0.0") @Experimental def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Int): Array[(RDD[T], RDD[T])] = { val numFoldsF = numFolds.toFloat @@ -269,6 +284,7 @@ object MLUtils { /** * Returns a new vector with `1.0` (bias) appended to the input vector. 
*/ + @Since("1.0.0") def appendBias(vector: Vector): Vector = { vector match { case dv: DenseVector => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala index ad20b7694a77..cde597939617 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala @@ -21,11 +21,11 @@ import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: @@ -33,8 +33,10 @@ import org.apache.spark.mllib.regression.LabeledPoint * for the features and adds Gaussian noise with weight 0.1 to generate labels. */ @DeveloperApi +@Since("0.8.0") object SVMDataGenerator { + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index 30d642c754b7..4d71d534a077 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -24,7 +24,7 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.types.{DataType, StructField, StructType} @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType} * This should be inherited by the class which implements model instances. */ @DeveloperApi +@Since("1.3.0") trait Saveable { /** @@ -50,6 +51,7 @@ trait Saveable { * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. */ + @Since("1.3.0") def save(sc: SparkContext, path: String): Unit /** Current version of model save/load format. */ @@ -64,6 +66,7 @@ trait Saveable { * This should be inherited by an object paired with the model class. */ @DeveloperApi +@Since("1.3.0") trait Loader[M <: Saveable] { /** @@ -75,6 +78,7 @@ trait Loader[M <: Saveable] { * @param path Path specifying the directory to which the model was saved. 
* @return Model instance */ + @Since("1.3.0") def load(sc: SparkContext, path: String): M } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java index f75e024a713e..618b95b9bd12 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java @@ -87,6 +87,8 @@ public void logisticRegressionWithSetters() { LogisticRegression parent = (LogisticRegression) model.parent(); assert(parent.getMaxIter() == 10); assert(parent.getRegParam() == 1.0); + assert(parent.getThresholds()[0] == 0.4); + assert(parent.getThresholds()[1] == 0.6); assert(parent.getThreshold() == 0.6); assert(model.getThreshold() == 0.6); @@ -147,4 +149,13 @@ public void logisticRegressionPredictorClassifierMethods() { } } } + + @Test + public void logisticRegressionTrainingSummary() { + LogisticRegression lr = new LogisticRegression(); + LogisticRegressionModel model = lr.fit(dataset); + + LogisticRegressionTrainingSummary summary = model.summary(); + assert(summary.totalIterations() == summary.objectiveHistory().length); + } } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java new file mode 100644 index 000000000000..ec6b4bf3c0f8 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.classification; + +import java.io.Serializable; +import java.util.Arrays; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +public class JavaMultilayerPerceptronClassifierSuite implements Serializable { + + private transient JavaSparkContext jsc; + private transient SQLContext sqlContext; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaLogisticRegressionSuite"); + sqlContext = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + sqlContext = null; + } + + @Test + public void testMLPC() { + DataFrame dataFrame = sqlContext.createDataFrame( + jsc.parallelize(Arrays.asList( + new LabeledPoint(0.0, Vectors.dense(0.0, 0.0)), + new LabeledPoint(1.0, Vectors.dense(0.0, 1.0)), + new LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), + new LabeledPoint(0.0, Vectors.dense(1.0, 1.0)))), + LabeledPoint.class); + MultilayerPerceptronClassifier mlpc = new MultilayerPerceptronClassifier() + .setLayers(new int[] {2, 5, 2}) + .setBlockSize(1) + .setSeed(11L) + .setMaxIter(100); + MultilayerPerceptronClassificationModel model = mlpc.fit(dataFrame); + DataFrame result = model.transform(dataFrame); + Row[] predictionAndLabels = result.select("prediction", "label").collect(); + for (Row r: predictionAndLabels) { + Assert.assertEquals((int) r.getDouble(0), (int) r.getDouble(1)); + } + } +} diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java index 32d0b3856b7e..a66a1e12927b 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaRandomForestClassifierSuite.java @@ -29,6 +29,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.impl.TreeTests; import org.apache.spark.mllib.classification.LogisticRegressionSuite; +import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; @@ -85,6 +86,7 @@ public void runDT() { model.toDebugString(); model.trees(); model.treeWeights(); + Vector importances = model.featureImportances(); /* // TODO: Add test once save/load are implemented. SPARK-6725 diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java new file mode 100644 index 000000000000..76cdd0fae84a --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature; + +import java.util.Arrays; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + + +public class JavaStopWordsRemoverSuite { + + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaStopWordsRemoverSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void javaCompatibilityTest() { + StopWordsRemover remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered"); + + JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) + )); + StructType schema = new StructType(new StructField[] { + new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) + }); + DataFrame dataset = jsql.createDataFrame(rdd, schema); + + remover.transform(dataset).collect(); + } +} diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java new file mode 100644 index 000000000000..56988b9fb29c --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature; + +import com.google.common.collect.Lists; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.attribute.Attribute; +import org.apache.spark.ml.attribute.AttributeGroup; +import org.apache.spark.ml.attribute.NumericAttribute; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.StructType; + + +public class JavaVectorSlicerSuite { + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaVectorSlicerSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void vectorSlice() { + Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr().withName("f1"), + NumericAttribute.defaultAttr().withName("f2"), + NumericAttribute.defaultAttr().withName("f3") + }; + AttributeGroup group = new AttributeGroup("userFeatures", attrs); + + JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), + RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) + )); + + DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + + VectorSlicer vectorSlicer = new VectorSlicer() + .setInputCol("userFeatures").setOutputCol("features"); + + vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); + + DataFrame output = vectorSlicer.transform(dataset); + + for (Row r : output.select("userFeatures", "features").take(2)) { + Vector features = r.getAs(1); + Assert.assertEquals(features.size(), 2); + } + } +} diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index e306ebadfe7c..a00ce5e249c3 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -29,6 +29,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.classification.LogisticRegressionSuite; import org.apache.spark.ml.impl.TreeTests; +import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; @@ -85,6 +86,7 @@ public void runDT() { model.toDebugString(); model.trees(); model.treeWeights(); + Vector importances = model.featureImportances(); /* // TODO: Add test once save/load are implemented. 
SPARK-6725 diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index d272a42c8576..3fea359a3b46 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -22,12 +22,14 @@ import java.util.Arrays; import scala.Tuple2; +import scala.Tuple3; import org.junit.After; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertArrayEquals; import org.junit.Before; import org.junit.Test; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.JavaPairRDD; @@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable { public void setUp() { sc = new JavaSparkContext("local", "JavaLDA"); ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<Tuple2<Long, Vector>>(); - for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) { - tinyCorpus.add(new Tuple2((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(), - LDASuite$.MODULE$.tinyCorpus()[i]._2())); + for (int i = 0; i < LDASuite.tinyCorpus().length; i++) { + tinyCorpus.add(new Tuple2((Long)LDASuite.tinyCorpus()[i]._1(), + LDASuite.tinyCorpus()[i]._2())); } JavaRDD<Tuple2<Long, Vector>> tmpCorpus = sc.parallelize(tinyCorpus, 2); corpus = JavaPairRDD.fromJavaRDD(tmpCorpus); @@ -60,7 +62,7 @@ public void tearDown() { @Test public void localLDAModel() { - Matrix topics = LDASuite$.MODULE$.tinyTopics(); + Matrix topics = LDASuite.tinyTopics(); double[] topicConcentration = new double[topics.numRows()]; Arrays.fill(topicConcentration, 1.0D / topics.numRows()); LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D); @@ -110,8 +112,8 @@ public void distributedLDAModel() { assertEquals(roundedLocalTopicSummary.length, k); // Check: log probabilities - assert(model.logLikelihood() < 0.0); - assert(model.logPrior() < 0.0); + assertTrue(model.logLikelihood() < 0.0); + assertTrue(model.logPrior() < 0.0); // Check: topic distributions JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions(); @@ -124,6 +126,21 @@ public Boolean call(Tuple2<Long, Vector> tuple2) { } }); assertEquals(topicDistributions.count(), nonEmptyCorpus.count()); + + // Check: javaTopTopicsPerDocuments + Tuple3<Long, int[], double[]> topTopics = model.javaTopTopicsPerDocument(3).first(); + Long docId = topTopics._1(); // confirm doc ID type + int[] topicIndices = topTopics._2(); + double[] topicWeights = topTopics._3(); + assertEquals(3, topicIndices.length); + assertEquals(3, topicWeights.length); + + // Check: topTopicAssignments + Tuple3<Long, int[], int[]> topicAssignment = model.javaTopicAssignments().first(); + Long docId2 = topicAssignment._1(); + int[] termIndices2 = topicAssignment._2(); + int[] topicIndices2 = topicAssignment._3(); + assertEquals(termIndices2.length, topicIndices2.length); } @Test @@ -160,11 +177,31 @@ public void OnlineOptimizerCompatibility() { assertEquals(roundedLocalTopicSummary.length, k); } - private static int tinyK = LDASuite$.MODULE$.tinyK(); - private static int tinyVocabSize = LDASuite$.MODULE$.tinyVocabSize(); - private static Matrix tinyTopics = LDASuite$.MODULE$.tinyTopics(); + @Test + public void localLdaMethods() { + JavaRDD<Tuple2<Long, Vector>> docs = sc.parallelize(toyData, 2); + JavaPairRDD<Long, Vector> pairedDocs = JavaPairRDD.fromJavaRDD(docs); + + // check: topicDistributions + assertEquals(toyModel.topicDistributions(pairedDocs).count(),
pairedDocs.count()); + + // check: logPerplexity + double logPerplexity = toyModel.logPerplexity(pairedDocs); + + // check: logLikelihood. + ArrayList<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<Tuple2<Long, Vector>>(); + docsSingleWord.add(new Tuple2(0L, Vectors.dense(1.0, 0.0, 0.0))); + JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(sc.parallelize(docsSingleWord)); + double logLikelihood = toyModel.logLikelihood(single); + } + + private static int tinyK = LDASuite.tinyK(); + private static int tinyVocabSize = LDASuite.tinyVocabSize(); + private static Matrix tinyTopics = LDASuite.tinyTopics(); private static Tuple2[] tinyTopicDescription = - LDASuite$.MODULE$.tinyTopicDescription(); + LDASuite.tinyTopicDescription(); private JavaPairRDD<Long, Vector> corpus; + private LocalLDAModel toyModel = LDASuite.toyModel(); + private ArrayList<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData(); } diff --git a/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java index effc8a1a6dab..fa4d334801ce 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java @@ -18,12 +18,12 @@ package org.apache.spark.mllib.evaluation; import java.io.Serializable; -import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import scala.Tuple2; import scala.Tuple2$; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -34,18 +34,18 @@ public class JavaRankingMetricsSuite implements Serializable { private transient JavaSparkContext sc; - private transient JavaRDD<Tuple2<ArrayList<Integer>, ArrayList<Integer>>> predictionAndLabels; + private transient JavaRDD<Tuple2<List<Integer>, List<Integer>>> predictionAndLabels; @Before public void setUp() { sc = new JavaSparkContext("local", "JavaRankingMetricsSuite"); - predictionAndLabels = sc.parallelize(Lists.newArrayList( + predictionAndLabels = sc.parallelize(Arrays.asList( Tuple2$.MODULE$.apply( - Lists.newArrayList(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Lists.newArrayList(1, 2, 3, 4, 5)), + Arrays.asList(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Arrays.asList(1, 2, 3, 4, 5)), Tuple2$.MODULE$.apply( - Lists.newArrayList(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Lists.newArrayList(1, 2, 3)), + Arrays.asList(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Arrays.asList(1, 2, 3)), Tuple2$.MODULE$.apply( - Lists.newArrayList(1, 2, 3, 4, 5), Lists.newArrayList())), 2); + Arrays.asList(1, 2, 3, 4, 5), Arrays.asList())), 2); } @After diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java index b3815ae6039c..d7c2cb3ae206 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java @@ -49,7 +49,7 @@ public void runAssociationRules() { JavaRDD<FreqItemset<String>> freqItemsets = sc.parallelize(Lists.newArrayList( new FreqItemset(new String[] {"a"}, 15L), new FreqItemset(new String[] {"b"}, 35L), - new FreqItemset(new String[] {"a", "b"}, 18L) + new FreqItemset(new String[] {"a", "b"}, 12L) )); JavaRDD<AssociationRules.Rule<String>> results = (new AssociationRules()).run(freqItemsets); diff --git a/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java index 62f7f26b7c98..eb4e3698624b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java +++
b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java @@ -27,7 +27,12 @@ import static org.junit.Assert.assertEquals; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.stat.test.ChiSqTestResult; +import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; public class JavaStatisticsSuite implements Serializable { private transient JavaSparkContext sc; @@ -53,4 +58,21 @@ public void testCorr() { // Check default method assertEquals(corr1, corr2); } + + @Test + public void kolmogorovSmirnovTest() { + JavaDoubleRDD data = sc.parallelizeDoubles(Lists.newArrayList(0.2, 1.0, -1.0, 2.0)); + KolmogorovSmirnovTestResult testResult1 = Statistics.kolmogorovSmirnovTest(data, "norm"); + KolmogorovSmirnovTestResult testResult2 = Statistics.kolmogorovSmirnovTest( + data, "norm", 0.0, 1.0); + } + + @Test + public void chiSqTest() { + JavaRDD data = sc.parallelize(Lists.newArrayList( + new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)), + new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)), + new LabeledPoint(0.0, Vectors.dense(2.4, 8.1)))); + ChiSqTestResult[] testResults = Statistics.chiSqTest(data); + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala index 63d2fa31c749..1f2c9b75b617 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala @@ -26,6 +26,7 @@ import org.scalatest.mock.MockitoSugar.mock import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.HashingTF import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.sql.DataFrame class PipelineSuite extends SparkFunSuite { @@ -65,6 +66,8 @@ class PipelineSuite extends SparkFunSuite { .setStages(Array(estimator0, transformer1, estimator2, transformer3)) val pipelineModel = pipeline.fit(dataset0) + MLTestingUtils.checkCopy(pipelineModel) + assert(pipelineModel.stages.length === 4) assert(pipelineModel.stages(0).eq(model0)) assert(pipelineModel.stages(1).eq(transformer1)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index c7bbf1ce07a2..f680d8d3c4cc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.tree.LeafNode +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} @@ -244,6 +245,9 @@ class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkConte val newData: DataFrame = TreeTests.setMetadata(rdd, categoricalFeatures, numClasses) val newTree = dt.fit(newData) + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(newTree) + val predictions = newTree.transform(newData) .select(newTree.getPredictionCol, newTree.getRawPredictionCol, newTree.getProbabilityCol) .collect() @@ -257,6 +261,19 @@ class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkConte } } + test("training with 1-category categorical feature") { + val data = sc.parallelize(Seq( + LabeledPoint(0, Vectors.dense(0, 2, 3)), + LabeledPoint(1, Vectors.dense(0, 3, 1)), + LabeledPoint(0, Vectors.dense(0, 2, 2)), + LabeledPoint(1, Vectors.dense(0, 3, 9)), + LabeledPoint(0, Vectors.dense(0, 2, 6)) + )) + val df = TreeTests.setMetadata(data, Map(0 -> 1), 2) + val dt = new DecisionTreeClassifier().setMaxDepth(3) + val model = dt.fit(df) + } + ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index d4b5896c12c0..e3909bccaa5c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree.LeafNode +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} @@ -92,6 +93,9 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { .setCheckpointInterval(2) val model = gbt.fit(df) + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + sc.checkpointDir = None Utils.deleteRecursively(tempDir) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index b7dd44753896..cce39f382f73 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -91,11 +92,53 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.hasParent) } + test("setThreshold, getThreshold") { + val lr = new LogisticRegression + // default + assert(lr.getThreshold === 0.5, "LogisticRegression.threshold should default to 0.5") + withClue("LogisticRegression should not have thresholds set by default.") { + intercept[java.util.NoSuchElementException] { // Note: The exception type may change in future + lr.getThresholds + } + } + // Set via threshold. + // Intuition: Large threshold or large thresholds(1) makes class 0 more likely. 
+ lr.setThreshold(1.0) + assert(lr.getThresholds === Array(0.0, 1.0)) + lr.setThreshold(0.0) + assert(lr.getThresholds === Array(1.0, 0.0)) + lr.setThreshold(0.5) + assert(lr.getThresholds === Array(0.5, 0.5)) + // Set via thresholds + val lr2 = new LogisticRegression + lr2.setThresholds(Array(0.3, 0.7)) + val expectedThreshold = 1.0 / (1.0 + 0.3 / 0.7) + assert(lr2.getThreshold ~== expectedThreshold relTol 1E-7) + // thresholds and threshold must be consistent + lr2.setThresholds(Array(0.1, 0.2, 0.3)) + withClue("getThreshold should throw error if thresholds has length != 2.") { + intercept[IllegalArgumentException] { + lr2.getThreshold + } + } + // thresholds and threshold must be consistent: values + withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { + intercept[IllegalArgumentException] { + val lr2model = lr2.fit(dataset, + lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0)) + lr2model.getThreshold + } + } + } + test("logistic regression doesn't fit intercept when fitIntercept is off") { val lr = new LogisticRegression lr.setFitIntercept(false) val model = lr.fit(dataset) assert(model.intercept === 0.0) + + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) } test("logistic regression with setters") { @@ -123,14 +166,16 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { s" ${predAllZero.count(_ === 0)} of ${dataset.count()} were 0.") // Call transform with params, and check that the params worked. val predNotAllZero = - model.transform(dataset, model.threshold -> 0.0, model.probabilityCol -> "myProb") + model.transform(dataset, model.threshold -> 0.0, + model.probabilityCol -> "myProb") .select("prediction", "myProb") .collect() .map { case Row(pred: Double, prob: Vector) => pred } assert(predNotAllZero.exists(_ !== 0.0)) // Call fit() with new params, and check as many params as we can. - val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.threshold -> 0.4, + lr.setThresholds(Array(0.6, 0.4)) + val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.probabilityCol -> "theProb") val parent2 = model2.parent.asInstanceOf[LogisticRegression] assert(parent2.getMaxIter === 5) @@ -699,6 +744,41 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val weightsR = Vectors.dense(0.0, 0.0, 0.0, 0.0) assert(model1.intercept ~== interceptR relTol 1E-5) - assert(model1.weights ~= weightsR absTol 1E-6) + assert(model1.weights ~== weightsR absTol 1E-6) + } + + test("evaluate on test set") { + // Evaluate on test set should be same as that of the transformed training data. + val lr = new LogisticRegression() + .setMaxIter(10) + .setRegParam(1.0) + .setThreshold(0.6) + val model = lr.fit(dataset) + val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] + + val sameSummary = model.evaluate(dataset).asInstanceOf[BinaryLogisticRegressionSummary] + assert(summary.areaUnderROC === sameSummary.areaUnderROC) + assert(summary.roc.collect() === sameSummary.roc.collect()) + assert(summary.pr.collect === sameSummary.pr.collect()) + assert( + summary.fMeasureByThreshold.collect() === sameSummary.fMeasureByThreshold.collect()) + assert(summary.recallByThreshold.collect() === sameSummary.recallByThreshold.collect()) + assert( + summary.precisionByThreshold.collect() === sameSummary.precisionByThreshold.collect()) + } + + test("statistics on training data") { + // Test that loss is monotonically decreasing. 
+ val lr = new LogisticRegression() + .setMaxIter(10) + .setRegParam(1.0) + .setThreshold(0.6) + val model = lr.fit(dataset) + assert( + model.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index aea3d9b69449..98bc9511163e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -21,7 +21,7 @@ import breeze.linalg.{Vector => BV} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.mllib.classification.NaiveBayes +import org.apache.spark.mllib.classification.NaiveBayes.{Multinomial, Bernoulli} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -31,8 +31,6 @@ import org.apache.spark.sql.Row class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext { - import NaiveBayes.{Multinomial, Bernoulli} - def validatePrediction(predictionAndLabels: DataFrame): Unit = { val numOfErrorPredictions = predictionAndLabels.collect().count { case Row(prediction: Double, label: Double) => diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index 3775292f6dca..977f0e0b70c1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.param.{ParamMap, ParamsSuite} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{MLTestingUtils, MetadataUtils} import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.evaluation.MulticlassMetrics @@ -70,6 +70,10 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(ova.getLabelCol === "label") assert(ova.getPredictionCol === "prediction") val ovaModel = ova.fit(dataset) + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(ovaModel) + assert(ovaModel.models.size === numClasses) val transformedDataset = ovaModel.transform(dataset) @@ -151,7 +155,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext { require(ovr1.getClassifier.getOrDefault(lr.maxIter) === 10, "copy should handle extra classifier params") - val ovrModel = ovr1.fit(dataset).copy(ParamMap(lr.threshold -> 0.1)) + val ovrModel = ovr1.fit(dataset).copy(ParamMap(lr.thresholds -> Array(0.9, 0.1))) ovrModel.models.foreach { case m: LogisticRegressionModel => require(m.getThreshold === 0.1, "copy should handle extra model params") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala new file mode 100644 index 000000000000..8f50cb924e64 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.classification + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} + +final class TestProbabilisticClassificationModel( + override val uid: String, + override val numClasses: Int) + extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { + + override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) + + override protected def predictRaw(input: Vector): Vector = { + input + } + + override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { + rawPrediction + } + + def friendlyPredict(input: Vector): Double = { + predict(input) + } +} + + +class ProbabilisticClassifierSuite extends SparkFunSuite { + + test("test thresholding") { + val thresholds = Array(0.5, 0.2) + val testModel = new TestProbabilisticClassificationModel("myuid", 2).setThresholds(thresholds) + assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0) + assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0) + } + + test("test thresholding not required") { + val testModel = new TestProbabilisticClassificationModel("myuid", 2) + assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index edf848b21a90..b4403ec30049 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.tree.LeafNode +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} @@ -67,7 +68,7 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte test("params") { ParamsSuite.checkParams(new RandomForestClassifier) val model = new RandomForestClassificationModel("rfc", - Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)), 2) + Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)), 2, 2) ParamsSuite.checkParams(model) } @@ -135,6 +136,9 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte val df: DataFrame = TreeTests.setMetadata(rdd, categoricalFeatures, numClasses) val model = rf.fit(df) + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(model) + val predictions = model.transform(df) .select(rf.getPredictionCol, rf.getRawPredictionCol, rf.getProbabilityCol) .collect() @@ -149,6 +153,35 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte } } + ///////////////////////////////////////////////////////////////////////////// + // Tests of feature importance + ///////////////////////////////////////////////////////////////////////////// + test("Feature importance with toy data") { + val numClasses = 2 + val rf = new RandomForestClassifier() + .setImpurity("Gini") + .setMaxDepth(3) + .setNumTrees(3) + .setFeatureSubsetStrategy("all") + .setSubsamplingRate(1.0) + .setSeed(123) + + // In this data, feature 1 is very important. + val data: RDD[LabeledPoint] = sc.parallelize(Seq( + new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), + new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)) + )) + val categoricalFeatures = Map.empty[Int, Int] + val df: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses) + + val importances = rf.fit(df).featureImportances + val mostImportantFeature = importances.argmax + assert(mostImportantFeature === 1) + } + ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 1f15ac02f400..688b0e31f91d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -52,10 +52,9 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(kmeans.getFeaturesCol === "features") assert(kmeans.getPredictionCol === "prediction") assert(kmeans.getMaxIter === 20) - assert(kmeans.getRuns === 1) assert(kmeans.getInitMode === MLlibKMeans.K_MEANS_PARALLEL) assert(kmeans.getInitSteps === 5) - assert(kmeans.getEpsilon === 1e-4) + assert(kmeans.getTol === 1e-4) } test("set parameters") { @@ -64,21 +63,19 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { .setFeaturesCol("test_feature") .setPredictionCol("test_prediction") .setMaxIter(33) - .setRuns(7) .setInitMode(MLlibKMeans.RANDOM) .setInitSteps(3) .setSeed(123) - .setEpsilon(1e-3) + .setTol(1e-3) assert(kmeans.getK === 9) assert(kmeans.getFeaturesCol === "test_feature") assert(kmeans.getPredictionCol === "test_prediction") assert(kmeans.getMaxIter === 33) - assert(kmeans.getRuns === 7) assert(kmeans.getInitMode === MLlibKMeans.RANDOM) assert(kmeans.getInitSteps === 3) assert(kmeans.getSeed === 123) - assert(kmeans.getEpsilon === 1e-3) + assert(kmeans.getTol === 1e-3) } test("parameters validation") { @@ -91,9 +88,6 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { intercept[IllegalArgumentException] { new KMeans().setInitSteps(0) } - intercept[IllegalArgumentException] { - new KMeans().setRuns(0) - } } test("fit & transform") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala index 5b203784559e..aa722da32393 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala @@ -63,7 +63,7 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext // default = rmse val evaluator = new RegressionEvaluator() - assert(evaluator.evaluate(predictions) ~== -0.1019382 absTol 0.001) + assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001) // r2 score evaluator.setMetricName("r2") @@ -71,6 +71,6 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext // mae evaluator.setMetricName("mae") - assert(evaluator.evaluate(predictions) ~== -0.08036075 absTol 0.001) + assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index ec85e0d151e0..0eba34fda622 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -21,6 +21,7 @@ import scala.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala new file mode 100644 index 000000000000..e192fa4850af --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.Row + +class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext { + + test("params") { + ParamsSuite.checkParams(new CountVectorizerModel(Array("empty"))) + } + + private def split(s: String): Seq[String] = s.split("\\s+") + + test("CountVectorizerModel common cases") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a b c d"), + Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), + (1, split("a b b c d a"), + Vectors.sparse(4, Seq((0, 2.0), (1, 2.0), (2, 1.0), (3, 1.0)))), + (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))), + (3, split(""), Vectors.sparse(4, Seq())), // empty string + (4, split("a notInDict d"), + Vectors.sparse(4, Seq((0, 1.0), (3, 1.0)))) // with words not in vocabulary + )).toDF("id", "words", "expected") + val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + .setInputCol("words") + .setOutputCol("features") + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + + test("CountVectorizer common cases") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a b c d e"), + Vectors.sparse(5, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)))), + (1, split("a a a a a a"), Vectors.sparse(5, Seq((0, 6.0)))), + (2, split("c"), Vectors.sparse(5, Seq((2, 1.0)))), + (3, split("b b b b b"), Vectors.sparse(5, Seq((1, 5.0))))) + ).toDF("id", "words", "expected") + val cv = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .fit(df) + assert(cv.vocabulary === Array("a", "b", "c", "d", "e")) + + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + + test("CountVectorizer vocabSize and minDF") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a b c d"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))), + (1, split("a b c"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))), + (2, split("a b"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))), + (3, split("a"), Vectors.sparse(3, Seq((0, 1.0))))) + ).toDF("id", "words", "expected") + val cvModel = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setVocabSize(3) // limit vocab size to 3 + .fit(df) + assert(cvModel.vocabulary === Array("a", "b", "c")) + + // minDF: ignore terms with count less than 3 + val cvModel2 = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setMinDF(3) + .fit(df) + assert(cvModel2.vocabulary === Array("a", "b")) + + cvModel2.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + + // minDF: ignore terms with freq < 0.75 + val cvModel3 = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setMinDF(3.0 / df.count()) + .fit(df) + assert(cvModel3.vocabulary === Array("a", "b")) + + cvModel3.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + + test("CountVectorizer throws exception when 
vocab is empty") { + intercept[IllegalArgumentException] { + val df = sqlContext.createDataFrame(Seq( + (0, split("a a b b c c")), + (1, split("aa bb cc"))) + ).toDF("id", "words") + val cvModel = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setVocabSize(3) // limit vocab size to 3 + .setMinDF(3) + .fit(df) + } + } + + test("CountVectorizerModel with minTF count") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a a a b b c c c d "), Vectors.sparse(4, Seq((0, 3.0), (2, 3.0)))), + (1, split("c c c c c c"), Vectors.sparse(4, Seq((2, 6.0)))), + (2, split("a"), Vectors.sparse(4, Seq())), + (3, split("e e e e e"), Vectors.sparse(4, Seq()))) + ).toDF("id", "words", "expected") + + // minTF: count + val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + .setInputCol("words") + .setOutputCol("features") + .setMinTF(3) + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } + + test("CountVectorizerModel with minTF freq") { + val df = sqlContext.createDataFrame(Seq( + (0, split("a a a b b c c c d "), Vectors.sparse(4, Seq((0, 3.0), (2, 3.0)))), + (1, split("c c c c c c"), Vectors.sparse(4, Seq((2, 6.0)))), + (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))), + (3, split("e e e e e"), Vectors.sparse(4, Seq()))) + ).toDF("id", "words", "expected") + + // minTF: count + val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + .setInputCol("words") + .setOutputCol("features") + .setMinTF(0.3) + cv.transform(df).select("features", "expected").collect().foreach { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala deleted file mode 100644 index e90d9d4ef21f..000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.ml.feature - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ - -class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext { - - test("params") { - ParamsSuite.checkParams(new CountVectorizerModel(Array("empty"))) - } - - test("CountVectorizerModel common cases") { - val df = sqlContext.createDataFrame(Seq( - (0, "a b c d".split(" ").toSeq, - Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), - (1, "a b b c d a".split(" ").toSeq, - Vectors.sparse(4, Seq((0, 2.0), (1, 2.0), (2, 1.0), (3, 1.0)))), - (2, "a".split(" ").toSeq, Vectors.sparse(4, Seq((0, 1.0)))), - (3, "".split(" ").toSeq, Vectors.sparse(4, Seq())), // empty string - (4, "a notInDict d".split(" ").toSeq, - Vectors.sparse(4, Seq((0, 1.0), (3, 1.0)))) // with words not in vocabulary - )).toDF("id", "words", "expected") - val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) - .setInputCol("words") - .setOutputCol("features") - val output = cv.transform(df).collect() - output.foreach { p => - val features = p.getAs[Vector]("features") - val expected = p.getAs[Vector]("expected") - assert(features ~== expected absTol 1e-14) - } - } - - test("CountVectorizerModel with minTermFreq") { - val df = sqlContext.createDataFrame(Seq( - (0, "a a a b b c c c d ".split(" ").toSeq, Vectors.sparse(4, Seq((0, 3.0), (2, 3.0)))), - (1, "c c c c c c".split(" ").toSeq, Vectors.sparse(4, Seq((2, 6.0)))), - (2, "a".split(" ").toSeq, Vectors.sparse(4, Seq())), - (3, "e e e e e".split(" ").toSeq, Vectors.sparse(4, Seq()))) - ).toDF("id", "words", "expected") - val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) - .setInputCol("words") - .setOutputCol("features") - .setMinTermFreq(3) - val output = cv.transform(df).collect() - output.foreach { p => - val features = p.getAs[Vector]("features") - val expected = p.getAs[Vector]("expected") - assert(features ~== expected absTol 1e-14) - } - } -} - - diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala index c452054bec92..c04dda41eea3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} @@ -51,6 +52,9 @@ class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext { .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), "Transformed vector is different with expected.") } + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(model) } test("MinMaxScaler arguments max must be larger than min") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala index d0ae36b28c7a..30c500f87a76 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseMatrix, Matrices} import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -56,6 +57,9 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { .setK(3) .fit(df) + // copied model must have the same parent. + MLTestingUtils.checkCopy(pca) + pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index d0295a0fe2fc..ad008b0a2b51 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -17,17 +17,23 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.types.{StringType, StructType, StructField, DoubleType} +import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Row +import org.apache.spark.sql.functions.col class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new StringIndexer) val model = new StringIndexerModel("indexer", Array("a", "b")) + val modelWithoutUid = new StringIndexerModel(Array("a", "b")) ParamsSuite.checkParams(model) + ParamsSuite.checkParams(modelWithoutUid) } test("StringIndexer") { @@ -37,6 +43,10 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("label") .setOutputCol("labelIndex") .fit(df) + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(indexer) + val transformed = indexer.transform(df) val attr = Attribute.fromStructField(transformed.schema("labelIndex")) .asInstanceOf[NominalAttribute] @@ -47,19 +57,6 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { // a -> 0, b -> 2, c -> 1 val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)) assert(output === expected) - // convert reverse our transform - val reversed = indexer.invert("labelIndex", "label2") - .transform(transformed) - .select("id", "label2") - assert(df.collect().map(r => (r.getInt(0), r.getString(1))).toSet === - reversed.collect().map(r => (r.getInt(0), r.getString(1))).toSet) - // Check invert using only metadata - val inverse2 = new StringIndexerInverse() - .setInputCol("labelIndex") - .setOutputCol("label2") - val reversed2 = inverse2.transform(transformed).select("id", "label2") - assert(df.collect().map(r => (r.getInt(0), r.getString(1))).toSet === - reversed2.collect().map(r => (r.getInt(0), r.getString(1))).toSet) } test("StringIndexer with a numeric input column") { @@ -88,4 +85,61 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { val df = sqlContext.range(0L, 10L) assert(indexerModel.transform(df).eq(df)) } + + test("IndexToString params") { + val idxToStr = new IndexToString() + ParamsSuite.checkParams(idxToStr) + } + + test("IndexToString.transform") { + val labels = Array("a", "b", "c") + val df0 = sqlContext.createDataFrame(Seq( + (0, "a"), (1, "b"), (2, "c"), (0, "a") + )).toDF("index", "expected") + + val idxToStr0 = new IndexToString() + .setInputCol("index") + .setOutputCol("actual") + .setLabels(labels) + idxToStr0.transform(df0).select("actual", "expected").collect().foreach { + case Row(actual, expected) => + assert(actual === expected) + } + + val attr = NominalAttribute.defaultAttr.withValues(labels) + val df1 = df0.select(col("index").as("indexWithAttr", attr.toMetadata()), col("expected")) + + val idxToStr1 = new IndexToString() + .setInputCol("indexWithAttr") + .setOutputCol("actual") + idxToStr1.transform(df1).select("actual", "expected").collect().foreach { + case Row(actual, expected) => + assert(actual === expected) + } + } + + test("StringIndexer, IndexToString are inverses") { + val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) + val df = sqlContext.createDataFrame(data).toDF("id", "label") + val indexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("labelIndex") + .fit(df) + val transformed = indexer.transform(df) + val idx2str = new IndexToString() + .setInputCol("labelIndex") + .setOutputCol("sameLabel") + .setLabels(indexer.labels) + idx2str.transform(transformed).select("label", "sameLabel").collect().foreach { + case Row(a: String, b: String) => + assert(a === b) + } + } + + test("IndexToString.transformSchema (SPARK-10573)") { + val idxToStr = new IndexToString().setInputCol("input").setOutputCol("output") + val inSchema = StructType(Seq(StructField("input", DoubleType))) + val outSchema = idxToStr.transformSchema(inSchema) + assert(outSchema("output").dataType === StringType) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala index 03120c828ca9..8cb0a2cf14d3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala @@ -22,6 +22,7 @@ 
import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.{Logging, SparkException, SparkFunSuite} import org.apache.spark.ml.attribute._ import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD @@ -109,6 +110,10 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext with L test("Throws error when given RDDs with different size vectors") { val vectorIndexer = getIndexer val model = vectorIndexer.fit(densePoints1) // vectors of length 3 + + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + model.transform(densePoints1) // should work model.transform(sparsePoints1) // should work intercept[SparkException] { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala new file mode 100644 index 000000000000..a6c2fba8360d --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext { + + test("params") { + val slicer = new VectorSlicer + ParamsSuite.checkParams(slicer) + assert(slicer.getIndices.length === 0) + assert(slicer.getNames.length === 0) + withClue("VectorSlicer should not have any features selected by default") { + intercept[IllegalArgumentException] { + slicer.validateParams() + } + } + } + + test("feature validity checks") { + import VectorSlicer._ + assert(validIndices(Array(0, 1, 8, 2))) + assert(validIndices(Array.empty[Int])) + assert(!validIndices(Array(-1))) + assert(!validIndices(Array(1, 2, 1))) + + assert(validNames(Array("a", "b"))) + assert(validNames(Array.empty[String])) + assert(!validNames(Array("", "b"))) + assert(!validNames(Array("a", "b", "a"))) + } + + test("Test vector slicer") { + val sqlContext = new SQLContext(sc) + + val data = Array( + Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), + Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), + Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), + Vectors.sparse(5, Seq()) + ) + + // Expected after selecting indices 1, 4 + val expected = Array( + Vectors.sparse(2, Seq((0, 2.3))), + Vectors.dense(2.3, 1.0), + Vectors.dense(0.0, 0.0), + Vectors.dense(-1.1, 3.3), + Vectors.sparse(2, Seq()) + ) + + val defaultAttr = NumericAttribute.defaultAttr + val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) + val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) + + val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) + val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) + + val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } + val df = sqlContext.createDataFrame(rdd, + StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) + + val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") + + def validateResults(df: DataFrame): Unit = { + df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => + assert(vec1 === vec2) + } + val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) + val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) + assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) + resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => + assert(a === b) + } + } + + vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) + validateResults(vectorSlicer.transform(df)) + + vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) + validateResults(vectorSlicer.transform(df)) + + vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) + validateResults(vectorSlicer.transform(df)) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index aa6ce533fd88..a2e46f202995 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -62,10 +63,75 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { .setSeed(42L) .fit(docDF) + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + model.transform(docDF).select("result", "expected").collect().foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.") } } + + test("getVectors") { + + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + val sentence = "a b " * 100 + "a c " * 10 + val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) + + val codes = Map( + "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451), + "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342), + "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351) + ) + val expectedVectors = codes.toSeq.sortBy(_._1).map { case (w, v) => Vectors.dense(v) } + + val docDF = doc.zip(doc).toDF("text", "alsotext") + + val model = new Word2Vec() + .setVectorSize(3) + .setInputCol("text") + .setOutputCol("result") + .setSeed(42L) + .fit(docDF) + + val realVectors = model.getVectors.sort("word").select("vector").map { + case Row(v: Vector) => v + }.collect() + + realVectors.zip(expectedVectors).foreach { + case (real, expected) => + assert(real ~== expected absTol 1E-5, "Actual vector is different from expected.") + } + } + + test("findSynonyms") { + + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + val sentence = "a b " * 100 + "a c " * 10 + val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) + val docDF = doc.zip(doc).toDF("text", "alsotext") + + val model = new Word2Vec() + .setVectorSize(3) + .setInputCol("text") + .setOutputCol("result") + .setSeed(42L) + .fit(docDF) + + val expectedSimilarity = Array(0.2789285076917586, -0.6336972059851644) + val (synonyms, similarity) = model.findSynonyms("a", 2).map { + case Row(w: String, sim: Double) => (w, sim) + }.collect().unzip + + assert(synonyms.toArray === Array("b", "c")) + expectedSimilarity.zip(similarity).map { + case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5) + } + + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala index 778abcba22c1..460849c79f04 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/impl/TreeTests.scala @@ -124,4 +124,22 @@ private[ml] object TreeTests extends SparkFunSuite { "checkEqual failed since the two tree ensembles were not identical") } } + + /** + * Helper method for constructing a tree for testing. + * Given left, right children, construct a parent node. 
+ * @param split Split for parent node + * @return Parent node with children attached + */ + def buildParentNode(left: Node, right: Node, split: Split): Node = { + val leftImp = left.impurityStats + val rightImp = right.impurityStats + val parentImp = leftImp.copy.add(rightImp) + val leftWeight = leftImp.count / parentImp.count.toDouble + val rightWeight = rightImp.count / parentImp.count.toDouble + val gain = parentImp.calculate() - + (leftWeight * leftImp.calculate() + rightWeight * rightImp.calculate()) + val pred = parentImp.predict + new InternalNode(pred, parentImp.calculate(), gain, left, right, split, parentImp) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index 050d4170ea01..2c878f8372a4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -199,6 +199,17 @@ class ParamsSuite extends SparkFunSuite { val inArray = ParamValidators.inArray[Int](Array(1, 2)) assert(inArray(1) && inArray(2) && !inArray(0)) + + val arrayLengthGt = ParamValidators.arrayLengthGt[Int](2.0) + assert(arrayLengthGt(Array(0, 1, 2)) && !arrayLengthGt(Array(0, 1))) + } + + test("Params.copyValues") { + val t = new TestParams() + val t2 = t.copy(ParamMap.empty) + assert(!t2.isSet(t2.maxIter)) + val t3 = t.copy(ParamMap(t.maxIter -> 20)) + assert(t3.isSet(t3.maxIter)) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 2e5cfe7027eb..eadc80e0e62b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -28,6 +28,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.{Logging, SparkException, SparkFunSuite} import org.apache.spark.ml.recommendation.ALS._ +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -374,6 +375,9 @@ class ALSSuite extends SparkFunSuite with MLlibTestSparkContext with Logging { } logInfo(s"Test RMSE is $rmse.") assert(rmse < targetRMSE) + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(model) } test("exact rank-1 matrix") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 33aa9d0d6234..b092bcd6a7e8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} @@ -61,6 +62,16 @@ class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContex compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } + test("copied model must have the same parent") { + val categoricalFeatures = Map(0 -> 2, 1-> 2) + val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0) + val model = new DecisionTreeRegressor() + .setImpurity("variance") + .setMaxDepth(2) + .setMaxBins(8).fit(df) + MLTestingUtils.checkCopy(model) + } + ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index dbdce0c9dea5..a68197b59193 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT} @@ -82,6 +83,9 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { .setMaxDepth(2) .setMaxIter(2) val model = gbt.fit(df) + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(model) val preds = model.transform(df) val predictions = preds.select("prediction").map(_.getDouble(0)) // Checks based on SPARK-8736 (to ensure it is not doing classification) @@ -104,6 +108,7 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { sc.checkpointDir = None Utils.deleteRecursively(tempDir) + } // TODO: Reinstate test once runWithValidation is implemented SPARK-7132 diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala index 66e4b170bae8..59f4193abc8f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala @@ -19,57 +19,47 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row} class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { - private val schema = StructType( - Array( - StructField("label", DoubleType), - StructField("features", DoubleType), - StructField("weight", DoubleType))) - - private val predictionSchema = StructType(Array(StructField("features", DoubleType))) - private def generateIsotonicInput(labels: Seq[Double]): DataFrame = { - val data = Seq.tabulate(labels.size)(i => Row(labels(i), i.toDouble, 1d)) - val parallelData = sc.parallelize(data) - - sqlContext.createDataFrame(parallelData, schema) + sqlContext.createDataFrame( + labels.zipWithIndex.map { case (label, i) => (label, i.toDouble, 1.0) } + ).toDF("label", "features", "weight") } private def generatePredictionInput(features: Seq[Double]): DataFrame = { - val data = Seq.tabulate(features.size)(i => Row(features(i))) - - val parallelData = sc.parallelize(data) - sqlContext.createDataFrame(parallelData, predictionSchema) + sqlContext.createDataFrame(features.map(Tuple1.apply)) + .toDF("features") } test("isotonic regression predictions") { val dataset = generateIsotonicInput(Seq(1, 2, 3, 1, 6, 17, 16, 17, 18)) - val trainer = new IsotonicRegression().setIsotonicParam(true) + val ir = new IsotonicRegression().setIsotonic(true) - val model = trainer.fit(dataset) + val model = ir.fit(dataset) val predictions = model .transform(dataset) - .select("prediction").map { - case Row(pred) => pred + .select("prediction").map { case Row(pred) => + pred }.collect() assert(predictions === Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18)) - assert(model.parentModel.boundaries === Array(0, 1, 3, 4, 5, 6, 7, 8)) - assert(model.parentModel.predictions === Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0)) - assert(model.parentModel.isotonic) + assert(model.boundaries === Vectors.dense(0, 1, 3, 4, 5, 6, 7, 8)) + assert(model.predictions === Vectors.dense(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0)) + assert(model.getIsotonic) } test("antitonic regression predictions") { val dataset = generateIsotonicInput(Seq(7, 5, 3, 5, 1)) - val trainer = new IsotonicRegression().setIsotonicParam(false) + val ir = new IsotonicRegression().setIsotonic(false) - val model = trainer.fit(dataset) + val model = ir.fit(dataset) val features = generatePredictionInput(Seq(-2.0, -1.0, 0.5, 0.75, 1.0, 2.0, 9.0)) val 
predictions = model @@ -94,32 +84,38 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val ir = new IsotonicRegression() assert(ir.getLabelCol === "label") assert(ir.getFeaturesCol === "features") - assert(ir.getWeightCol === "weight") assert(ir.getPredictionCol === "prediction") - assert(ir.getIsotonicParam === true) + assert(!ir.isDefined(ir.weightCol)) + assert(ir.getIsotonic) + assert(ir.getFeatureIndex === 0) val model = ir.fit(dataset) + + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + model.transform(dataset) .select("label", "features", "prediction", "weight") .collect() assert(model.getLabelCol === "label") assert(model.getFeaturesCol === "features") - assert(model.getWeightCol === "weight") assert(model.getPredictionCol === "prediction") - assert(model.getIsotonicParam === true) + assert(!model.isDefined(model.weightCol)) + assert(model.getIsotonic) + assert(model.getFeatureIndex === 0) assert(model.hasParent) } test("set parameters") { val isotonicRegression = new IsotonicRegression() - .setIsotonicParam(false) - .setWeightParam("w") + .setIsotonic(false) + .setWeightCol("w") .setFeaturesCol("f") .setLabelCol("l") .setPredictionCol("p") - assert(isotonicRegression.getIsotonicParam === false) + assert(!isotonicRegression.getIsotonic) assert(isotonicRegression.getWeightCol === "w") assert(isotonicRegression.getFeaturesCol === "f") assert(isotonicRegression.getLabelCol === "l") @@ -130,7 +126,7 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val dataset = generateIsotonicInput(Seq(1, 2, 3)) intercept[IllegalArgumentException] { - new IsotonicRegression().setWeightParam("w").fit(dataset) + new IsotonicRegression().setWeightCol("w").fit(dataset) } intercept[IllegalArgumentException] { @@ -145,4 +141,27 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { new IsotonicRegression().fit(dataset).setFeaturesCol("f").transform(dataset) } } + + test("vector features column with feature index") { + val dataset = sqlContext.createDataFrame(Seq( + (4.0, Vectors.dense(0.0, 1.0)), + (3.0, Vectors.dense(0.0, 2.0)), + (5.0, Vectors.sparse(2, Array(1), Array(3.0)))) + ).toDF("label", "features") + + val ir = new IsotonicRegression() + .setFeatureIndex(1) + + val model = ir.fit(dataset) + + val features = generatePredictionInput(Seq(2.0, 3.0, 4.0, 5.0)) + + val predictions = model + .transform(features) + .select("prediction").map { + case Row(pred) => pred + }.collect() + + assert(predictions === Array(3.5, 5.0, 5.0, 5.0)) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 7cdda3db88ad..2aaee71ecc73 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{DenseVector, Vectors} import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ @@ -70,7 +71,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lir.getRegParam === 0.0) assert(lir.getElasticNetParam === 0.0) 
assert(lir.getFitIntercept) + assert(lir.getStandardization) val model = lir.fit(dataset) + + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + model.transform(dataset) .select("label", "prediction") .collect() @@ -81,8 +87,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { } test("linear regression with intercept without regularization") { - val trainer = new LinearRegression - val model = trainer.fit(dataset) + val trainer1 = new LinearRegression + // The result should be the same regardless of standardization without regularization + val trainer2 = (new LinearRegression).setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -95,28 +104,36 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { > weights 3 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 6.300528 - as.numeric.data.V2. 4.701024 - as.numeric.data.V3. 7.198257 + (Intercept) 6.298698 + as.numeric.data.V2. 4.700706 + as.numeric.data.V3. 7.199082 */ val interceptR = 6.298698 val weightsR = Vectors.dense(4.700706, 7.199082) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights ~= weightsR relTol 1E-3) + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights ~= weightsR relTol 1E-3) + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights ~= weightsR relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression without intercept without regularization") { - val trainer = (new LinearRegression).setFitIntercept(false) - val model = trainer.fit(dataset) - val modelWithoutIntercept = trainer.fit(datasetWithoutIntercept) + val trainer1 = (new LinearRegression).setFitIntercept(false) + // Without regularization the results should be the same + val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) + val model1 = trainer1.fit(dataset) + val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) + val model2 = trainer2.fit(dataset) + val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) + /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, @@ -130,26 +147,34 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { */ val weightsR = Vectors.dense(6.995908, 5.275131) - assert(model.intercept ~== 0 absTol 1E-3) - assert(model.weights ~= weightsR relTol 1E-3) + assert(model1.intercept ~== 0 absTol 1E-3) + assert(model1.weights ~= weightsR relTol 1E-3) + assert(model2.intercept ~== 0 absTol 1E-3) + assert(model2.weights ~= weightsR relTol 1E-3) + /* Then again with the data with no intercept: > weightsWithoutIntercept 3 x 1 sparse Matrix of class "dgCMatrix" - s0 + s0 (Intercept) . as.numeric.data3.V2. 4.70011 as.numeric.data3.V3. 
7.19943 */ val weightsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) - assert(modelWithoutIntercept.intercept ~== 0 absTol 1E-3) - assert(modelWithoutIntercept.weights ~= weightsWithoutInterceptR relTol 1E-3) + assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept1.weights ~= weightsWithoutInterceptR relTol 1E-3) + assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept2.weights ~= weightsWithoutInterceptR relTol 1E-3) } test("linear regression with intercept with L1 regularization") { - val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) - val model = trainer.fit(dataset) + val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -160,24 +185,44 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { as.numeric.data.V2. 4.024821 as.numeric.data.V3. 6.679841 */ - val interceptR = 6.24300 - val weightsR = Vectors.dense(4.024821, 6.679841) + val interceptR1 = 6.24300 + val weightsR1 = Vectors.dense(4.024821, 6.679841) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.416948 + as.numeric.data.V2. 3.893869 + as.numeric.data.V3. 6.724286 + */ + val interceptR2 = 6.416948 + val weightsR2 = Vectors.dense(3.893869, 6.724286) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights ~= weightsR relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression without intercept with L1 regularization") { - val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setFitIntercept(false) - val model = trainer.fit(dataset) + val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setFitIntercept(false).setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -189,51 +234,90 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { as.numeric.data.V2. 6.299752 as.numeric.data.V3. 
4.772913 */ - val interceptR = 0.0 - val weightsR = Vectors.dense(6.299752, 4.772913) + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(6.299752, 4.772913) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, + intercept=FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.232193 + as.numeric.data.V3. 4.764229 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(6.232193, 4.764229) - assert(model.intercept ~== interceptR absTol 1E-5) - assert(model.weights ~= weightsR relTol 1E-3) + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression with intercept with L2 regularization") { - val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) - val model = trainer.fit(dataset) + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.328062 - as.numeric.data.V2. 3.222034 - as.numeric.data.V3. 4.926260 + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.269376 + as.numeric.data.V2. 3.736216 + as.numeric.data.V3. 5.712356) */ - val interceptR = 5.269376 - val weightsR = Vectors.dense(3.736216, 5.712356) + val interceptR1 = 5.269376 + val weightsR1 = Vectors.dense(3.736216, 5.712356) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights ~= weightsR relTol 1E-3) + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.791109 + as.numeric.data.V2. 3.435466 + as.numeric.data.V3. 
5.910406 + */ + val interceptR2 = 5.791109 + val weightsR2 = Vectors.dense(3.435466, 5.910406) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression without intercept with L2 regularization") { - val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setFitIntercept(false) - val model = trainer.fit(dataset) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setFitIntercept(false).setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, @@ -245,23 +329,42 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { as.numeric.data.V2. 5.522875 as.numeric.data.V3. 4.214502 */ - val interceptR = 0.0 - val weightsR = Vectors.dense(5.522875, 4.214502) + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(5.522875, 4.214502) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + intercept = FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 5.263704 + as.numeric.data.V3. 4.187419 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(5.263704, 4.187419) - assert(model.intercept ~== interceptR absTol 1E-3) - assert(model.weights ~== weightsR relTol 1E-3) + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression with intercept with ElasticNet regularization") { - val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) - val model = trainer.fit(dataset) + val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -272,24 +375,43 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { as.numeric.data.V2. 3.168435 as.numeric.data.V3. 
5.200403 */ - val interceptR = 5.696056 - val weightsR = Vectors.dense(3.670489, 6.001122) + val interceptR1 = 5.696056 + val weightsR1 = Vectors.dense(3.670489, 6.001122) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights ~== weightsR relTol 1E-3) + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6 + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.114723 + as.numeric.data.V2. 3.409937 + as.numeric.data.V3. 6.146531 + */ + val interceptR2 = 6.114723 + val weightsR2 = Vectors.dense(3.409937, 6.146531) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } test("linear regression without intercept with ElasticNet regularization") { - val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setFitIntercept(false) - val model = trainer.fit(dataset) + val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setFitIntercept(false).setStandardization(false) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -301,16 +423,32 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { as.numeric.dataM.V2. 5.673348 as.numeric.dataM.V3. 4.322251 */ - val interceptR = 0.0 - val weightsR = Vectors.dense(5.673348, 4.322251) + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(5.673348, 4.322251) - assert(model.intercept ~== interceptR absTol 1E-3) - assert(model.weights ~= weightsR relTol 1E-3) + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) - model.transform(dataset).select("features", "prediction").collect().foreach { + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, + intercept=FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 5.477988 + as.numeric.data.V3. 
4.297622 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(5.477988, 4.297622) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } @@ -372,5 +510,4 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .zip(testSummary.residuals.select("residuals").collect()) .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } } - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala index b24ecaa57c89..7b1b3f11481d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests +import org.apache.spark.ml.util.MLTestingUtils +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} @@ -26,7 +28,6 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame - /** * Test suite for [[RandomForestRegressor]]. */ @@ -71,6 +72,35 @@ class RandomForestRegressorSuite extends SparkFunSuite with MLlibTestSparkContex regressionTestWithContinuousFeatures(rf) } + test("Feature importance with toy data") { + val rf = new RandomForestRegressor() + .setImpurity("variance") + .setMaxDepth(3) + .setNumTrees(3) + .setFeatureSubsetStrategy("all") + .setSubsamplingRate(1.0) + .setSeed(123) + + // In this data, feature 1 is very important. + val data: RDD[LabeledPoint] = sc.parallelize(Seq( + new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), + new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), + new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)) + )) + val categoricalFeatures = Map.empty[Int, Int] + val df: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, 0) + + val model = rf.fit(df) + + // copied model must have the same parent. 
+ MLTestingUtils.checkCopy(model) + val importances = model.featureImportances + val mostImportantFeature = importances.argmax + assert(mostImportantFeature === 1) + } + ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala new file mode 100644 index 000000000000..dc852795c7f6 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.tree.impl + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.classification.DecisionTreeClassificationModel +import org.apache.spark.ml.impl.TreeTests +import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.tree.impurity.GiniCalculator +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.util.collection.OpenHashMap + +/** + * Test suite for [[RandomForest]]. + */ +class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { + + import RandomForestSuite.mapToVec + + test("computeFeatureImportance, featureImportances") { + /* Build tree for testing, with this structure: + grandParent + left2 parent + left right + */ + val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0)) + val left = new LeafNode(0.0, leftImp.calculate(), leftImp) + + val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0)) + val right = new LeafNode(2.0, rightImp.calculate(), rightImp) + + val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5)) + val parentImp = parent.impurityStats + + val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0)) + val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp) + + val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0)) + val grandImp = grandParent.impurityStats + + // Test feature importance computed at different subtrees. 
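+ // Each internal node contributes count * impurity - (leftCount * leftImpurity +
+ // rightCount * rightImpurity) to the importance of the feature it splits on; for `parent`
+ // above this is roughly 0.653 * 14 - (0.611 * 6 + 0.531 * 8) ~= 1.23. Per-tree importances
+ // are then normalized to sum to 1 and averaged across the trees of the forest (see the
+ // assertions below).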
+ def testNode(node: Node, expected: Map[Int, Double]): Unit = { + val map = new OpenHashMap[Int, Double]() + RandomForest.computeFeatureImportance(node, map) + assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) + } + + // Leaf node + testNode(left, Map.empty[Int, Double]) + + // Internal node with 2 leaf children + val feature0importance = parentImp.calculate() * parentImp.count - + (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count) + testNode(parent, Map(0 -> feature0importance)) + + // Full tree + val feature1importance = grandImp.calculate() * grandImp.count - + (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count) + testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance)) + + // Forest consisting of (full tree) + (internal node with 2 leaves) + val trees = Array(parent, grandParent).map { root => + new DecisionTreeClassificationModel(root, numClasses = 3).asInstanceOf[DecisionTreeModel] + } + val importances: Vector = RandomForest.featureImportances(trees, 2) + val tree2norm = feature0importance + feature1importance + val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0, + (feature1importance / tree2norm) / 2.0) + assert(importances ~== expected relTol 0.01) + } + + test("normalizeMapValues") { + val map = new OpenHashMap[Int, Double]() + map(0) = 1.0 + map(2) = 2.0 + RandomForest.normalizeMapValues(map) + val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0) + assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) + } + +} + +private object RandomForestSuite { + + def mapToVec(map: Map[Int, Double]): Vector = { + val size = (map.keys.toSeq :+ 0).max + 1 + val (indices, values) = map.toSeq.sortBy(_._1).unzip + Vectors.sparse(size, indices.toArray, values.toArray) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index db64511a7605..fde02e0c84bc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.tuning import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator} @@ -53,6 +54,10 @@ class CrossValidatorSuite extends SparkFunSuite with MLlibTestSparkContext { .setEvaluator(eval) .setNumFolds(3) val cvModel = cv.fit(dataset) + + // copied model must have the same parent.
+ MLTestingUtils.checkCopy(cvModel) + val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression] assert(parent.getRegParam === 0.001) assert(parent.getMaxIter === 10) @@ -138,6 +143,8 @@ object CrossValidatorSuite { throw new UnsupportedOperationException } + override def isLargerBetter: Boolean = true + override val uid: String = "eval" override def copy(extra: ParamMap): MyEvaluator = defaultCopy(extra) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index c8e58f216cce..ef24e6fb6b80 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -132,6 +132,8 @@ object TrainValidationSplitSuite { throw new UnsupportedOperationException } + override def isLargerBetter: Boolean = true + override val uid: String = "eval" override def copy(extra: ParamMap): MyEvaluator = defaultCopy(extra) diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala new file mode 100644 index 000000000000..d290cc9b06e7 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.util + +import org.apache.spark.ml.Model +import org.apache.spark.ml.param.ParamMap + +object MLTestingUtils { + def checkCopy(model: Model[_]): Unit = { + val copied = model.copy(ParamMap.empty) + .asInstanceOf[Model[_]] + assert(copied.parent.uid == model.parent.uid) + assert(copied.parent == model.parent) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index b218d72f1268..a72723eb00da 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.clustering import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{Vectors, Matrices} +import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrices} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -76,6 +76,20 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { assert(gmm.gaussians(1).sigma ~== Esigma(1) absTol 1E-3) } + test("two clusters with distributed decompositions") { + val data = sc.parallelize(GaussianTestData.data2, 2) + + val k = 5 + val d = data.first().size + assert(GaussianMixture.shouldDistributeGaussians(k, d)) + + val gmm = new GaussianMixture() + .setK(k) + .run(data) + + assert(gmm.k === k) + } + test("single cluster with sparse data") { val data = sc.parallelize(Array( Vectors.sparse(3, Array(0, 2), Array(4.0, 2.0)), @@ -116,7 +130,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { val sparseGMM = new GaussianMixture() .setK(2) .setInitialModel(initialGmm) - .run(data) + .run(sparseData) assert(sparseGMM.weights(0) ~== Ew(0) absTol 1E-3) assert(sparseGMM.weights(1) ~== Ew(1) absTol 1E-3) @@ -148,6 +162,16 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("model prediction, parallel and local") { + val data = sc.parallelize(GaussianTestData.data) + val gmm = new GaussianMixture().setK(2).setSeed(0).run(data) + + val batchPredictions = gmm.predict(data) + batchPredictions.zip(data).collect().foreach { case (batchPred, datum) => + assert(batchPred === gmm.predict(datum)) + } + } + object GaussianTestData { val data = Array( @@ -158,5 +182,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734) ) + val data2: Array[Vector] = Array.tabulate(25){ i: Int => + Vectors.dense(Array.tabulate(50)(i + _.toDouble)) + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index fdc2554ab853..37fb69d68f6b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.clustering +import java.util.{ArrayList => JArrayList} + import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax} import org.apache.spark.SparkFunSuite @@ -66,6 +68,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { // Train a model val lda = new LDA() lda.setK(k) + .setOptimizer(new EMLDAOptimizer) 
.setDocConcentration(topicSmoothing) .setTopicConcentration(termSmoothing) .setMaxIterations(5) @@ -133,17 +136,34 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } // Top 3 documents per topic - model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach {case (t1, t2) => + model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach { case (t1, t2) => assert(t1._1 === t2._1) assert(t1._2 === t2._2) } // All documents per topic val q = tinyCorpus.length - model.topDocumentsPerTopic(q).zip(topDocsByTopicDistributions(q)).foreach {case (t1, t2) => + model.topDocumentsPerTopic(q).zip(topDocsByTopicDistributions(q)).foreach { case (t1, t2) => assert(t1._1 === t2._1) assert(t1._2 === t2._2) } + + // Check: topTopicAssignments + // Make sure it assigns a topic to each term appearing in each doc. + val topTopicAssignments: Map[Long, (Array[Int], Array[Int])] = + model.topicAssignments.collect().map(x => x._1 -> (x._2, x._3)).toMap + assert(topTopicAssignments.keys.max < tinyCorpus.length) + tinyCorpus.foreach { case (docID: Long, doc: Vector) => + if (topTopicAssignments.contains(docID)) { + val (inds, vals) = topTopicAssignments(docID) + assert(inds.length === doc.numNonzeros) + // For "term" in actual doc, + // check that it has a topic assigned. + doc.foreachActive((term, wcnt) => assert(wcnt === 0 || inds.contains(term))) + } else { + assert(doc.numNonzeros === 0) + } + } } test("vertex indexing") { @@ -160,8 +180,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { test("setter alias") { val lda = new LDA().setAlpha(2.0).setBeta(3.0) - assert(lda.getAlpha.toArray.forall(_ === 2.0)) - assert(lda.getDocConcentration.toArray.forall(_ === 2.0)) + assert(lda.getAsymmetricAlpha.toArray.forall(_ === 2.0)) + assert(lda.getAsymmetricDocConcentration.toArray.forall(_ === 2.0)) assert(lda.getBeta === 3.0) assert(lda.getTopicConcentration === 3.0) } @@ -404,7 +424,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01) @@ -575,6 +595,17 @@ private[clustering] object LDASuite { Vectors.sparse(6, Array(4, 5), Array(1, 1)) ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) } + /** Used in the Java Test Suite */ + def javaToyData: JArrayList[(java.lang.Long, Vector)] = { + val javaData = new JArrayList[(java.lang.Long, Vector)] + var i = 0 + while (i < toyData.length) { + javaData.add((toyData(i)._1, toyData(i)._2)) + i += 1 + } + javaData + } + def toyModel: LocalLDAModel = { val k = 2 val vocabSize = 6 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala index 0ae48d62cc6b..a83e543859b8 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { - test("PrefixSpan internal (integer seq, -1 delim) run, singleton itemsets") { + test("PrefixSpan internal (integer seq, 0 delim) run, 
singleton itemsets") { /* library("arulesSequences") @@ -35,83 +35,81 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { */ val sequences = Array( - Array(1, -1, 3, -1, 4, -1, 5), - Array(2, -1, 3, -1, 1), - Array(2, -1, 4, -1, 1), - Array(3, -1, 1, -1, 3, -1, 4, -1, 5), - Array(3, -1, 4, -1, 4, -1, 3), - Array(6, -1, 5, -1, 3)) + Array(0, 1, 0, 3, 0, 4, 0, 5, 0), + Array(0, 2, 0, 3, 0, 1, 0), + Array(0, 2, 0, 4, 0, 1, 0), + Array(0, 3, 0, 1, 0, 3, 0, 4, 0, 5, 0), + Array(0, 3, 0, 4, 0, 4, 0, 3, 0), + Array(0, 6, 0, 5, 0, 3, 0)) val rdd = sc.parallelize(sequences, 2).cache() - val prefixspan = new PrefixSpan() - .setMinSupport(0.33) - .setMaxPatternLength(50) - val result1 = prefixspan.run(rdd) + val result1 = PrefixSpan.genFreqPatterns( + rdd, minCount = 2L, maxPatternLength = 50, maxLocalProjDBSize = 16L) val expectedValue1 = Array( - (Array(1), 4L), - (Array(1, -1, 3), 2L), - (Array(1, -1, 3, -1, 4), 2L), - (Array(1, -1, 3, -1, 4, -1, 5), 2L), - (Array(1, -1, 3, -1, 5), 2L), - (Array(1, -1, 4), 2L), - (Array(1, -1, 4, -1, 5), 2L), - (Array(1, -1, 5), 2L), - (Array(2), 2L), - (Array(2, -1, 1), 2L), - (Array(3), 5L), - (Array(3, -1, 1), 2L), - (Array(3, -1, 3), 2L), - (Array(3, -1, 4), 3L), - (Array(3, -1, 4, -1, 5), 2L), - (Array(3, -1, 5), 2L), - (Array(4), 4L), - (Array(4, -1, 5), 2L), - (Array(5), 3L) + (Array(0, 1, 0), 4L), + (Array(0, 1, 0, 3, 0), 2L), + (Array(0, 1, 0, 3, 0, 4, 0), 2L), + (Array(0, 1, 0, 3, 0, 4, 0, 5, 0), 2L), + (Array(0, 1, 0, 3, 0, 5, 0), 2L), + (Array(0, 1, 0, 4, 0), 2L), + (Array(0, 1, 0, 4, 0, 5, 0), 2L), + (Array(0, 1, 0, 5, 0), 2L), + (Array(0, 2, 0), 2L), + (Array(0, 2, 0, 1, 0), 2L), + (Array(0, 3, 0), 5L), + (Array(0, 3, 0, 1, 0), 2L), + (Array(0, 3, 0, 3, 0), 2L), + (Array(0, 3, 0, 4, 0), 3L), + (Array(0, 3, 0, 4, 0, 5, 0), 2L), + (Array(0, 3, 0, 5, 0), 2L), + (Array(0, 4, 0), 4L), + (Array(0, 4, 0, 5, 0), 2L), + (Array(0, 5, 0), 3L) ) compareInternalResults(expectedValue1, result1.collect()) - prefixspan.setMinSupport(0.5).setMaxPatternLength(50) - val result2 = prefixspan.run(rdd) + val result2 = PrefixSpan.genFreqPatterns( + rdd, minCount = 3, maxPatternLength = 50, maxLocalProjDBSize = 32L) val expectedValue2 = Array( - (Array(1), 4L), - (Array(3), 5L), - (Array(3, -1, 4), 3L), - (Array(4), 4L), - (Array(5), 3L) + (Array(0, 1, 0), 4L), + (Array(0, 3, 0), 5L), + (Array(0, 3, 0, 4, 0), 3L), + (Array(0, 4, 0), 4L), + (Array(0, 5, 0), 3L) ) compareInternalResults(expectedValue2, result2.collect()) - prefixspan.setMinSupport(0.33).setMaxPatternLength(2) - val result3 = prefixspan.run(rdd) + val result3 = PrefixSpan.genFreqPatterns( + rdd, minCount = 2, maxPatternLength = 2, maxLocalProjDBSize = 32L) val expectedValue3 = Array( - (Array(1), 4L), - (Array(1, -1, 3), 2L), - (Array(1, -1, 4), 2L), - (Array(1, -1, 5), 2L), - (Array(2, -1, 1), 2L), - (Array(2), 2L), - (Array(3), 5L), - (Array(3, -1, 1), 2L), - (Array(3, -1, 3), 2L), - (Array(3, -1, 4), 3L), - (Array(3, -1, 5), 2L), - (Array(4), 4L), - (Array(4, -1, 5), 2L), - (Array(5), 3L) + (Array(0, 1, 0), 4L), + (Array(0, 1, 0, 3, 0), 2L), + (Array(0, 1, 0, 4, 0), 2L), + (Array(0, 1, 0, 5, 0), 2L), + (Array(0, 2, 0, 1, 0), 2L), + (Array(0, 2, 0), 2L), + (Array(0, 3, 0), 5L), + (Array(0, 3, 0, 1, 0), 2L), + (Array(0, 3, 0, 3, 0), 2L), + (Array(0, 3, 0, 4, 0), 3L), + (Array(0, 3, 0, 5, 0), 2L), + (Array(0, 4, 0), 4L), + (Array(0, 4, 0, 5, 0), 2L), + (Array(0, 5, 0), 3L) ) compareInternalResults(expectedValue3, result3.collect()) } test("PrefixSpan internal (integer seq, -1 delim) 
run, variable-size itemsets") { val sequences = Array( - Array(1, -1, 1, 2, 3, -1, 1, 3, -1, 4, -1, 3, 6), - Array(1, 4, -1, 3, -1, 2, 3, -1, 1, 5), - Array(5, 6, -1, 1, 2, -1, 4, 6, -1, 3, -1, 2), - Array(5, -1, 7, -1, 1, 6, -1, 3, -1, 2, -1, 3)) + Array(0, 1, 0, 1, 2, 3, 0, 1, 3, 0, 4, 0, 3, 6, 0), + Array(0, 1, 4, 0, 3, 0, 2, 3, 0, 1, 5, 0), + Array(0, 5, 6, 0, 1, 2, 0, 4, 6, 0, 3, 0, 2, 0), + Array(0, 5, 0, 7, 0, 1, 6, 0, 3, 0, 2, 0, 3, 0)) val rdd = sc.parallelize(sequences, 2).cache() - val prefixspan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5) - val result = prefixspan.run(rdd) + val result = PrefixSpan.genFreqPatterns( + rdd, minCount = 2, maxPatternLength = 5, maxLocalProjDBSize = 128L) /* To verify results, create file "prefixSpanSeqs" with content @@ -200,63 +198,87 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { 53 <{1},{2},{1}> 0.50 */ val expectedValue = Array( - (Array(1), 4L), - (Array(2), 4L), - (Array(3), 4L), - (Array(4), 3L), - (Array(5), 3L), - (Array(6), 3L), - (Array(1, -1, 6), 2L), - (Array(2, -1, 6), 2L), - (Array(5, -1, 6), 2L), - (Array(1, 2, -1, 6), 2L), - (Array(1, -1, 4), 2L), - (Array(2, -1, 4), 2L), - (Array(1, 2, -1, 4), 2L), - (Array(1, -1, 3), 4L), - (Array(2, -1, 3), 3L), - (Array(2, 3), 2L), - (Array(3, -1, 3), 3L), - (Array(4, -1, 3), 3L), - (Array(5, -1, 3), 2L), - (Array(6, -1, 3), 2L), - (Array(5, -1, 6, -1, 3), 2L), - (Array(6, -1, 2, -1, 3), 2L), - (Array(5, -1, 2, -1, 3), 2L), - (Array(5, -1, 1, -1, 3), 2L), - (Array(2, -1, 4, -1, 3), 2L), - (Array(1, -1, 4, -1, 3), 2L), - (Array(1, 2, -1, 4, -1, 3), 2L), - (Array(1, -1, 3, -1, 3), 3L), - (Array(1, 2, -1, 3), 2L), - (Array(1, -1, 2, -1, 3), 2L), - (Array(1, -1, 2, 3), 2L), - (Array(1, -1, 2), 4L), - (Array(1, 2), 2L), - (Array(3, -1, 2), 3L), - (Array(4, -1, 2), 2L), - (Array(5, -1, 2), 2L), - (Array(6, -1, 2), 2L), - (Array(5, -1, 6, -1, 2), 2L), - (Array(6, -1, 3, -1, 2), 2L), - (Array(5, -1, 3, -1, 2), 2L), - (Array(5, -1, 1, -1, 2), 2L), - (Array(4, -1, 3, -1, 2), 2L), - (Array(1, -1, 3, -1, 2), 3L), - (Array(5, -1, 6, -1, 3, -1, 2), 2L), - (Array(5, -1, 1, -1, 3, -1, 2), 2L), - (Array(1, -1, 1), 2L), - (Array(2, -1, 1), 2L), - (Array(3, -1, 1), 2L), - (Array(5, -1, 1), 2L), - (Array(2, 3, -1, 1), 2L), - (Array(1, -1, 3, -1, 1), 2L), - (Array(1, -1, 2, 3, -1, 1), 2L), - (Array(1, -1, 2, -1, 1), 2L)) + (Array(0, 1, 0), 4L), + (Array(0, 2, 0), 4L), + (Array(0, 3, 0), 4L), + (Array(0, 4, 0), 3L), + (Array(0, 5, 0), 3L), + (Array(0, 6, 0), 3L), + (Array(0, 1, 0, 6, 0), 2L), + (Array(0, 2, 0, 6, 0), 2L), + (Array(0, 5, 0, 6, 0), 2L), + (Array(0, 1, 2, 0, 6, 0), 2L), + (Array(0, 1, 0, 4, 0), 2L), + (Array(0, 2, 0, 4, 0), 2L), + (Array(0, 1, 2, 0, 4, 0), 2L), + (Array(0, 1, 0, 3, 0), 4L), + (Array(0, 2, 0, 3, 0), 3L), + (Array(0, 2, 3, 0), 2L), + (Array(0, 3, 0, 3, 0), 3L), + (Array(0, 4, 0, 3, 0), 3L), + (Array(0, 5, 0, 3, 0), 2L), + (Array(0, 6, 0, 3, 0), 2L), + (Array(0, 5, 0, 6, 0, 3, 0), 2L), + (Array(0, 6, 0, 2, 0, 3, 0), 2L), + (Array(0, 5, 0, 2, 0, 3, 0), 2L), + (Array(0, 5, 0, 1, 0, 3, 0), 2L), + (Array(0, 2, 0, 4, 0, 3, 0), 2L), + (Array(0, 1, 0, 4, 0, 3, 0), 2L), + (Array(0, 1, 2, 0, 4, 0, 3, 0), 2L), + (Array(0, 1, 0, 3, 0, 3, 0), 3L), + (Array(0, 1, 2, 0, 3, 0), 2L), + (Array(0, 1, 0, 2, 0, 3, 0), 2L), + (Array(0, 1, 0, 2, 3, 0), 2L), + (Array(0, 1, 0, 2, 0), 4L), + (Array(0, 1, 2, 0), 2L), + (Array(0, 3, 0, 2, 0), 3L), + (Array(0, 4, 0, 2, 0), 2L), + (Array(0, 5, 0, 2, 0), 2L), + (Array(0, 6, 0, 2, 0), 2L), + (Array(0, 5, 0, 6, 0, 2, 0), 
2L), + (Array(0, 6, 0, 3, 0, 2, 0), 2L), + (Array(0, 5, 0, 3, 0, 2, 0), 2L), + (Array(0, 5, 0, 1, 0, 2, 0), 2L), + (Array(0, 4, 0, 3, 0, 2, 0), 2L), + (Array(0, 1, 0, 3, 0, 2, 0), 3L), + (Array(0, 5, 0, 6, 0, 3, 0, 2, 0), 2L), + (Array(0, 5, 0, 1, 0, 3, 0, 2, 0), 2L), + (Array(0, 1, 0, 1, 0), 2L), + (Array(0, 2, 0, 1, 0), 2L), + (Array(0, 3, 0, 1, 0), 2L), + (Array(0, 5, 0, 1, 0), 2L), + (Array(0, 2, 3, 0, 1, 0), 2L), + (Array(0, 1, 0, 3, 0, 1, 0), 2L), + (Array(0, 1, 0, 2, 3, 0, 1, 0), 2L), + (Array(0, 1, 0, 2, 0, 1, 0), 2L)) compareInternalResults(expectedValue, result.collect()) } + test("PrefixSpan projections with multiple partial starts") { + val sequences = Seq( + Array(Array(1, 2), Array(1, 2, 3))) + val rdd = sc.parallelize(sequences, 2) + val prefixSpan = new PrefixSpan() + .setMinSupport(1.0) + .setMaxPatternLength(2) + val model = prefixSpan.run(rdd) + val expected = Array( + (Array(Array(1)), 1L), + (Array(Array(1, 2)), 1L), + (Array(Array(1), Array(1)), 1L), + (Array(Array(1), Array(2)), 1L), + (Array(Array(1), Array(3)), 1L), + (Array(Array(1, 3)), 1L), + (Array(Array(2)), 1L), + (Array(Array(2, 3)), 1L), + (Array(Array(2), Array(1)), 1L), + (Array(Array(2), Array(2)), 1L), + (Array(Array(2), Array(3)), 1L), + (Array(Array(3)), 1L)) + compareResults(expected, model.freqSequences.collect()) + } + test("PrefixSpan Integer type, variable-size itemsets") { val sequences = Seq( Array(Array(1, 2), Array(3)), @@ -265,7 +287,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { Array(Array(6))) val rdd = sc.parallelize(sequences, 2).cache() - val prefixspan = new PrefixSpan() + val prefixSpan = new PrefixSpan() .setMinSupport(0.5) .setMaxPatternLength(5) @@ -296,7 +318,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { 5 <{1,2}> 0.75 */ - val model = prefixspan.run(rdd) + val model = prefixSpan.run(rdd) val expected = Array( (Array(Array(1)), 3L), (Array(Array(2)), 3L), @@ -304,7 +326,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { (Array(Array(1), Array(3)), 2L), (Array(Array(1, 2)), 3L) ) - compareResults(expected, model.freqSequences.collect().map(x => (x.sequence, x.freq))) + compareResults(expected, model.freqSequences.collect()) } test("PrefixSpan String type, variable-size itemsets") { @@ -318,11 +340,11 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { Array(Array(6))).map(seq => seq.map(itemSet => itemSet.map(intToString))) val rdd = sc.parallelize(sequences, 2).cache() - val prefixspan = new PrefixSpan() + val prefixSpan = new PrefixSpan() .setMinSupport(0.5) .setMaxPatternLength(5) - val model = prefixspan.run(rdd) + val model = prefixSpan.run(rdd) val expected = Array( (Array(Array(1)), 3L), (Array(Array(2)), 3L), @@ -332,17 +354,17 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { ).map { case (pattern, count) => (pattern.map(itemSet => itemSet.map(intToString)), count) } - compareResults(expected, model.freqSequences.collect().map(x => (x.sequence, x.freq))) + compareResults(expected, model.freqSequences.collect()) } private def compareResults[Item]( expectedValue: Array[(Array[Array[Item]], Long)], - actualValue: Array[(Array[Array[Item]], Long)]): Unit = { + actualValue: Array[PrefixSpan.FreqSequence[Item]]): Unit = { val expectedSet = expectedValue.map { case (pattern: Array[Array[Item]], count: Long) => (pattern.map(itemSet => itemSet.toSet).toSeq, count) }.toSet - val actualSet = actualValue.map { case (pattern: Array[Array[Item]], 
count: Long) => - (pattern.map(itemSet => itemSet.toSet).toSeq, count) + val actualSet = actualValue.map { x => + (x.sequence.map(_.toSet).toSeq, x.freq) }.toSet assert(expectedSet === actualSet) } @@ -354,11 +376,4 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { val actualSet = actualValue.map(x => (x._1.toSeq, x._2)).toSet assert(expectedSet === actualSet) } - - private def insertDelimiter(sequence: Array[Int]): Array[Int] = { - sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) => - List(a, b) - }.flatten - } - } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala index d119e0b50a39..8db5c8424abe 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala @@ -204,6 +204,7 @@ class BLASSuite extends SparkFunSuite { val C14 = C1.copy val C15 = C1.copy val C16 = C1.copy + val C17 = C1.copy val expected2 = new DenseMatrix(4, 2, Array(2.0, 1.0, 4.0, 2.0, 4.0, 0.0, 4.0, 3.0)) val expected3 = new DenseMatrix(4, 2, Array(2.0, 2.0, 4.0, 2.0, 8.0, 0.0, 6.0, 6.0)) val expected4 = new DenseMatrix(4, 2, Array(5.0, 0.0, 10.0, 5.0, 0.0, 0.0, 5.0, 0.0)) @@ -217,6 +218,10 @@ class BLASSuite extends SparkFunSuite { assert(C2 ~== expected2 absTol 1e-15) assert(C3 ~== expected3 absTol 1e-15) assert(C4 ~== expected3 absTol 1e-15) + gemm(1.0, dA, B, 0.0, C17) + assert(C17 ~== expected absTol 1e-15) + gemm(1.0, sA, B, 0.0, C17) + assert(C17 ~== expected absTol 1e-15) withClue("columns of A don't match the rows of B") { intercept[Exception] { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2562db..bfd6d5495f5e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { + val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) + assert(dm1 === dm1) + assert(dm1 !== dm1.transpose) + + val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) + assert(dm1 === dm2.transpose) + + val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse + assert(sm1 === sm1) + assert(sm1 === dm1) + assert(sm1 !== sm1.transpose) + + val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse + assert(sm1 === sm2.transpose) + assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 1c37ea5123e8..6508ddeba420 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -367,4 +367,11 @@ class VectorsSuite extends SparkFunSuite with Logging { val sv1c = sv1.compressed.asInstanceOf[DenseVector] assert(sv1 === sv1c) } + + test("SparseVector.slice") { + val v = new SparseVector(5, Array(1, 2, 4), Array(1.1, 2.2, 4.4)) + assert(v.slice(Array(0, 2)) === new SparseVector(2, Array(1), Array(2.2))) + assert(v.slice(Array(2, 0)) === new SparseVector(2, Array(0), Array(2.2))) + assert(v.slice(Array(2, 0, 3, 4)) === new SparseVector(4, Array(0, 3), Array(2.2, 4.4))) + } } diff --git a/network/common/pom.xml 
b/network/common/pom.xml index 7dc3068ab8cb..3cfa7f5a8aa3 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java index f76bb49e874f..f0363830b61a 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java @@ -52,6 +52,11 @@ public static ChunkFetchFailure decode(ByteBuf buf) { return new ChunkFetchFailure(streamChunkId, errorString); } + @Override + public int hashCode() { + return Objects.hashCode(streamChunkId, errorString); + } + @Override public boolean equals(Object other) { if (other instanceof ChunkFetchFailure) { diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java index 980947cf13f6..5a173af54f61 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java @@ -48,6 +48,11 @@ public static ChunkFetchRequest decode(ByteBuf buf) { return new ChunkFetchRequest(StreamChunkId.decode(buf)); } + @Override + public int hashCode() { + return streamChunkId.hashCode(); + } + @Override public boolean equals(Object other) { if (other instanceof ChunkFetchRequest) { diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java index ff4936470c69..c962fb7ecf76 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java @@ -61,6 +61,11 @@ public static ChunkFetchSuccess decode(ByteBuf buf) { return new ChunkFetchSuccess(streamChunkId, managedBuf); } + @Override + public int hashCode() { + return Objects.hashCode(streamChunkId, buffer); + } + @Override public boolean equals(Object other) { if (other instanceof ChunkFetchSuccess) { diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java b/network/common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java index 6b991375fc48..2dfc7876ba32 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java @@ -50,6 +50,11 @@ public static RpcFailure decode(ByteBuf buf) { return new RpcFailure(requestId, errorString); } + @Override + public int hashCode() { + return Objects.hashCode(requestId, errorString); + } + @Override public boolean equals(Object other) { if (other instanceof RpcFailure) { diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java b/network/common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java index cdee0b0e0316..745039db742f 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java @@ -59,6 +59,11 @@ public static RpcRequest decode(ByteBuf buf) { return new RpcRequest(requestId, 
message); } + @Override + public int hashCode() { + return Objects.hashCode(requestId, Arrays.hashCode(message)); + } + @Override public boolean equals(Object other) { if (other instanceof RpcRequest) { diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java b/network/common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java index 0a62e09a8115..1671cd444f03 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java @@ -50,6 +50,11 @@ public static RpcResponse decode(ByteBuf buf) { return new RpcResponse(requestId, response); } + @Override + public int hashCode() { + return Objects.hashCode(requestId, Arrays.hashCode(response)); + } + @Override public boolean equals(Object other) { if (other instanceof RpcResponse) { diff --git a/network/common/src/test/java/org/apache/spark/network/TestManagedBuffer.java b/network/common/src/test/java/org/apache/spark/network/TestManagedBuffer.java index 38113a918f79..83c90f9eff2b 100644 --- a/network/common/src/test/java/org/apache/spark/network/TestManagedBuffer.java +++ b/network/common/src/test/java/org/apache/spark/network/TestManagedBuffer.java @@ -80,6 +80,11 @@ public Object convertToNetty() throws IOException { return underlying.convertToNetty(); } + @Override + public int hashCode() { + return underlying.hashCode(); + } + @Override public boolean equals(Object other) { if (other instanceof ManagedBuffer) { diff --git a/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index be6632bb8cf4..8104004847a2 100644 --- a/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -17,11 +17,11 @@ package org.apache.spark.network.sasl; -import static com.google.common.base.Charsets.UTF_8; import static org.junit.Assert.*; import static org.mockito.Mockito.*; import java.io.File; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; import java.util.Random; @@ -138,8 +138,8 @@ private void testBasicSasl(boolean encrypt) throws Exception { public Void answer(InvocationOnMock invocation) { byte[] message = (byte[]) invocation.getArguments()[1]; RpcResponseCallback cb = (RpcResponseCallback) invocation.getArguments()[2]; - assertEquals("Ping", new String(message, UTF_8)); - cb.onSuccess("Pong".getBytes(UTF_8)); + assertEquals("Ping", new String(message, StandardCharsets.UTF_8)); + cb.onSuccess("Pong".getBytes(StandardCharsets.UTF_8)); return null; } }) @@ -148,8 +148,9 @@ public Void answer(InvocationOnMock invocation) { SaslTestCtx ctx = new SaslTestCtx(rpcHandler, encrypt, false); try { - byte[] response = ctx.client.sendRpcSync("Ping".getBytes(UTF_8), TimeUnit.SECONDS.toMillis(10)); - assertEquals("Pong", new String(response, UTF_8)); + byte[] response = ctx.client.sendRpcSync("Ping".getBytes(StandardCharsets.UTF_8), + TimeUnit.SECONDS.toMillis(10)); + assertEquals("Pong", new String(response, StandardCharsets.UTF_8)); } finally { ctx.close(); } @@ -235,7 +236,7 @@ public void testFileRegionEncryption() throws Exception { final String blockSizeConf = "spark.network.sasl.maxEncryptedBlockSize"; System.setProperty(blockSizeConf, "1k"); - final AtomicReference response = new AtomicReference(); + final AtomicReference response = new 
AtomicReference<>(); final File file = File.createTempFile("sasltest", ".txt"); SaslTestCtx ctx = null; try { @@ -321,7 +322,8 @@ public void testDataEncryptionIsActuallyEnabled() throws Exception { SaslTestCtx ctx = null; try { ctx = new SaslTestCtx(mock(RpcHandler.class), true, true); - ctx.client.sendRpcSync("Ping".getBytes(UTF_8), TimeUnit.SECONDS.toMillis(10)); + ctx.client.sendRpcSync("Ping".getBytes(StandardCharsets.UTF_8), + TimeUnit.SECONDS.toMillis(10)); fail("Should have failed to send RPC to server."); } catch (Exception e) { assertFalse(e.getCause() instanceof TimeoutException); diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 532463e96fbb..dd004b121339 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java index cca8b17c4f12..167ef3310422 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java @@ -27,7 +27,7 @@ /** * Initial registration message between an executor and its local shuffle server. - * Returns nothing (empty bye array). + * Returns nothing (empty byte array). */ public class RegisterExecutor extends BlockTransferMessage { public final String appId; diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java index 1c28fc1dff24..94a61d6caadc 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java @@ -23,6 +23,9 @@ import org.apache.spark.network.protocol.Encoders; import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + /** * A message sent from the driver to register with the MesosExternalShuffleService. 
*/ diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandlerSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandlerSuite.java index 73374cdc77a2..1d197497b7c8 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandlerSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandlerSuite.java @@ -90,9 +90,11 @@ public void testOpenShuffleBlocks() { (StreamHandle) BlockTransferMessage.Decoder.fromByteArray(response.getValue()); assertEquals(2, handle.numChunks); - ArgumentCaptor stream = ArgumentCaptor.forClass(Iterator.class); + @SuppressWarnings("unchecked") + ArgumentCaptor> stream = (ArgumentCaptor>) + (ArgumentCaptor) ArgumentCaptor.forClass(Iterator.class); verify(streamManager, times(1)).registerStream(stream.capture()); - Iterator buffers = (Iterator) stream.getValue(); + Iterator buffers = stream.getValue(); assertEquals(block0Marker, buffers.next()); assertEquals(block1Marker, buffers.next()); assertFalse(buffers.hasNext()); diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java index 1ad0d72ae5ec..06e46f924109 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java @@ -20,7 +20,9 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import com.google.common.collect.ImmutableMap; @@ -67,13 +69,13 @@ public void afterEach() { public void testNoFailures() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // Immediately return both blocks successfully. ImmutableMap.builder() .put("b0", block0) .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -86,13 +88,13 @@ public void testNoFailures() throws IOException { public void testUnrecoverableFailure() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // b0 throws a non-IOException error, so it will be failed without retry. ImmutableMap.builder() .put("b0", new RuntimeException("Ouch!")) .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -105,7 +107,7 @@ public void testUnrecoverableFailure() throws IOException { public void testSingleIOExceptionOnFirst() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // IOException will cause a retry. Since b0 fails, we will retry both. 
ImmutableMap.builder() .put("b0", new IOException("Connection failed or something")) @@ -114,8 +116,8 @@ public void testSingleIOExceptionOnFirst() throws IOException { ImmutableMap.builder() .put("b0", block0) .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -128,7 +130,7 @@ public void testSingleIOExceptionOnFirst() throws IOException { public void testSingleIOExceptionOnSecond() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // IOException will cause a retry. Since b1 fails, we will not retry b0. ImmutableMap.builder() .put("b0", block0) @@ -136,8 +138,8 @@ public void testSingleIOExceptionOnSecond() throws IOException { .build(), ImmutableMap.builder() .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -150,7 +152,7 @@ public void testSingleIOExceptionOnSecond() throws IOException { public void testTwoIOExceptions() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // b0's IOException will trigger retry, b1's will be ignored. ImmutableMap.builder() .put("b0", new IOException()) @@ -164,8 +166,8 @@ public void testTwoIOExceptions() throws IOException { // b1 returns successfully within 2 retries. ImmutableMap.builder() .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -178,7 +180,7 @@ public void testTwoIOExceptions() throws IOException { public void testThreeIOExceptions() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // b0's IOException will trigger retry, b1's will be ignored. ImmutableMap.builder() .put("b0", new IOException()) @@ -196,8 +198,8 @@ public void testThreeIOExceptions() throws IOException { // This is not reached -- b1 has failed. ImmutableMap.builder() .put("b1", block1) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -210,7 +212,7 @@ public void testThreeIOExceptions() throws IOException { public void testRetryAndUnrecoverable() throws IOException { BlockFetchingListener listener = mock(BlockFetchingListener.class); - Map[] interactions = new Map[] { + List> interactions = Arrays.asList( // b0's IOException will trigger retry, subsequent messages will be ignored. ImmutableMap.builder() .put("b0", new IOException()) @@ -226,8 +228,8 @@ public void testRetryAndUnrecoverable() throws IOException { // b2 succeeds in its last retry. ImmutableMap.builder() .put("b2", block2) - .build(), - }; + .build() + ); performInteractions(interactions, listener); @@ -248,7 +250,8 @@ public void testRetryAndUnrecoverable() throws IOException { * subset of the original blocks in a second interaction. 
*/ @SuppressWarnings("unchecked") - private void performInteractions(final Map[] interactions, BlockFetchingListener listener) + private static void performInteractions(List> interactions, + BlockFetchingListener listener) throws IOException { TransportConf conf = new TransportConf(new SystemPropertyConfigProvider()); diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml index a99f7c4392d3..21a991c91dbb 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index be0dac953abf..508d8e4fe0aa 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ @@ -104,6 +104,7 @@ external/flume-sink external/flume-assembly external/mqtt + external/mqtt-assembly external/zeromq examples repl @@ -134,11 +135,12 @@ 2.4.0 org.spark-project.hive - 0.13.1a + 1.2.1.spark - 0.13.1 + 1.2.1 10.10.1.1 1.7.0 + 1.6.0 1.2.4 8.1.14.v20131031 3.0.0.v201112011016 @@ -151,7 +153,10 @@ 0.7.1 1.9.16 1.2.1 + 4.3.2 + + 3.1 3.4.1 2.10.4 2.10 @@ -161,6 +166,20 @@ 2.4.4 1.1.1.7 1.1.2 + 1.2.0-incubating + 1.10 + + 2.6 + + 3.3.2 + 3.2.10 + 2.7.8 + 1.9 + 2.5 + 3.5.2 + 1.3.9 + 0.9.2 + ${java.home} spring-releases Spring Release Repository https://repo.spring.io/libs-release - true + false false @@ -402,12 +429,17 @@ org.apache.commons commons-lang3 - 3.3.2 + ${commons-lang3.version} + + + commons-lang + commons-lang + ${commons-lang2.version} commons-codec commons-codec - 1.10 + ${commons-codec.version} org.apache.commons @@ -422,7 +454,12 @@ com.google.code.findbugs jsr305 - 1.3.9 + ${jsr305.version} + + + commons-httpclient + commons-httpclient + ${httpclient.classic.version} org.apache.httpcomponents @@ -439,6 +476,16 @@ selenium-java 2.42.2 test + + + com.google.guava + guava + + + io.netty + netty + + @@ -624,15 +671,26 @@ com.sun.jersey jersey-server - 1.9 + ${jersey.version} ${hadoop.deps.scope} com.sun.jersey jersey-core - 1.9 + ${jersey.version} ${hadoop.deps.scope} + + com.sun.jersey + jersey-json + ${jersey.version} + + + stax + stax-api + + + org.scala-lang scala-compiler @@ -1022,58 +1080,499 @@ hive-beeline ${hive.version} ${hive.deps.scope} + + + ${hive.group} + hive-common + + + ${hive.group} + hive-exec + + + ${hive.group} + hive-jdbc + + + ${hive.group} + hive-metastore + + + ${hive.group} + hive-service + + + ${hive.group} + hive-shims + + + org.apache.thrift + libthrift + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + commons-logging + commons-logging + + ${hive.group} hive-cli ${hive.version} ${hive.deps.scope} + + + ${hive.group} + hive-common + + + ${hive.group} + hive-exec + + + ${hive.group} + hive-jdbc + + + ${hive.group} + hive-metastore + + + ${hive.group} + hive-serde + + + ${hive.group} + hive-service + + + ${hive.group} + hive-shims + + + org.apache.thrift + libthrift + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + commons-logging + commons-logging + + ${hive.group} - hive-exec + hive-common ${hive.version} ${hive.deps.scope} + + ${hive.group} + hive-shims + + + org.apache.ant + ant + + + org.apache.zookeeper + zookeeper + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + commons-logging commons-logging + + + + + ${hive.group} + hive-exec + + ${hive.version} + ${hive.deps.scope} + + + + + ${hive.group} + hive-metastore + + + ${hive.group} + 
hive-shims + + + ${hive.group} + hive-ant + + + + ${hive.group} + spark-client + + + + + ant + ant + + + org.apache.ant + ant + com.esotericsoftware.kryo kryo + + commons-codec + commons-codec + + + commons-httpclient + commons-httpclient + org.apache.avro avro-mapred + + + org.apache.calcite + calcite-core + + + org.apache.curator + apache-curator + + + org.apache.curator + curator-client + + + org.apache.curator + curator-framework + + + org.apache.thrift + libthrift + + + org.apache.thrift + libfb303 + + + org.apache.zookeeper + zookeeper + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + commons-logging + commons-logging + ${hive.group} hive-jdbc ${hive.version} - ${hive.deps.scope} + + + ${hive.group} + hive-common + + + ${hive.group} + hive-common + + + ${hive.group} + hive-metastore + + + ${hive.group} + hive-serde + + + ${hive.group} + hive-service + + + ${hive.group} + hive-shims + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpcore + + + org.apache.curator + curator-framework + + + org.apache.thrift + libthrift + + + org.apache.thrift + libfb303 + + + org.apache.zookeeper + zookeeper + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + commons-logging + commons-logging + + + ${hive.group} hive-metastore ${hive.version} ${hive.deps.scope} + + + ${hive.group} + hive-serde + + + ${hive.group} + hive-shims + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.google.guava + guava + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + ${hive.group} hive-serde ${hive.version} ${hive.deps.scope} + + ${hive.group} + hive-common + + + ${hive.group} + hive-shims + + + commons-codec + commons-codec + + + com.google.code.findbugs + jsr305 + + + org.apache.avro + avro + + + org.apache.thrift + libthrift + + + org.apache.thrift + libfb303 + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + commons-logging commons-logging + + + + + ${hive.group} + hive-service + ${hive.version} + ${hive.deps.scope} + + + ${hive.group} + hive-common + + + ${hive.group} + hive-exec + + + ${hive.group} + hive-metastore + + + ${hive.group} + hive-shims + + + commons-codec + commons-codec + + + org.apache.curator + curator-framework + + + org.apache.curator + curator-recipes + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + + + + + ${hive.group} + hive-shims + ${hive.version} + ${hive.deps.scope} + + + com.google.guava + guava + + + org.apache.hadoop + hadoop-yarn-server-resourcemanager + + + org.apache.curator + curator-framework + + + org.apache.thrift + libthrift + + + org.apache.zookeeper + zookeeper + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + commons-logging - commons-logging-api + commons-logging @@ -1095,6 +1594,12 @@ ${parquet.version} ${parquet.test.deps.scope} + + com.twitter + parquet-hadoop-bundle + ${hive.parquet.version} + compile + org.apache.flume flume-ng-core @@ -1135,6 +1640,125 @@ + + org.apache.calcite + calcite-core + ${calcite.version} + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.google.guava + guava + + + com.google.code.findbugs + jsr305 + + + org.codehaus.janino + janino + + + + org.hsqldb + hsqldb + + + org.pentaho + pentaho-aggdesigner-algorithm + + + + + org.apache.calcite + calcite-avatica + ${calcite.version} + + + 
com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.codehaus.janino + janino + ${janino.version} + + + joda-time + joda-time + ${joda.version} + + + org.jodd + jodd-core + ${jodd.version} + + + org.datanucleus + datanucleus-core + ${datanucleus-core.version} + + + org.apache.thrift + libthrift + ${libthrift.version} + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpcore + + + org.slf4j + slf4j-api + + + + + org.apache.thrift + libfb303 + ${libthrift.version} + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpcore + + + org.slf4j + slf4j-api + + + @@ -1223,6 +1847,7 @@ ${java.version} -target ${java.version} + -Xlint:all,-serial,-path @@ -1236,6 +1861,9 @@ UTF-8 1024m true + + -Xlint:all,-serial,-path + @@ -1267,10 +1895,13 @@ ${project.build.directory}/tmp ${spark.test.home} 1 + false false false true true + + src false @@ -1305,6 +1936,8 @@ false true true + + __not_used__ @@ -1748,44 +2381,6 @@ - - mapr3 - - 1.0.3-mapr-3.0.3 - 2.4.1-mapr-1408 - 0.98.4-mapr-1408 - 3.4.5-mapr-1406 - - - - - mapr4 - - 2.4.1-mapr-1408 - 2.4.1-mapr-1408 - 0.98.4-mapr-1408 - 3.4.5-mapr-1406 - - - - org.apache.curator - curator-recipes - ${curator.version} - - - org.apache.zookeeper - zookeeper - - - - - org.apache.zookeeper - zookeeper - 3.4.5-mapr-1406 - - - - hive-thriftserver diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 280aac931915..88745dc086a0 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -62,8 +62,6 @@ object MimaExcludes { "org.apache.spark.ml.classification.LogisticCostFun.this"), // SQL execution is considered private. excludePackage("org.apache.spark.sql.execution"), - // Parquet support is considered private. 
- excludePackage("org.apache.spark.sql.parquet"), // The old JSON RDD is removed in favor of streaming Jackson ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.json.JsonRDD$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.json.JsonRDD"), @@ -155,11 +153,45 @@ object MimaExcludes { ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$NewHadoopMapPartitionsWithSplitRDD$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitionSpec$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DescribeCommand"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLException") + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLException"), + // SPARK-9763 Minimize exposure of internal SQL classes + excludePackage("org.apache.spark.sql.parquet"), + excludePackage("org.apache.spark.sql.json"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRDD$DecimalConversion$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCPartition"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JdbcUtils$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRDD$DecimalConversion"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCPartitioningInfo$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCPartition$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.package"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRDD$JDBCConversion"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRDD$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.package$DriverWrapper"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRDD"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCPartitioningInfo"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JdbcUtils"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.DefaultSource"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRelation$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.package$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.jdbc.JDBCRelation") ) ++ Seq( // SPARK-4751 Dynamic allocation for standalone mode ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.SparkContext.supportDynamicAllocation") + ) ++ Seq( + // SPARK-9580: Remove SQL test singletons + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.test.LocalSQLContext$SQLSession"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.test.LocalSQLContext"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.test.TestSQLContext"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.test.TestSQLContext$") + ) ++ Seq( + // SPARK-9704 Made ProbabilisticClassifier, Identifiable, VectorUDT public APIs + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.mllib.linalg.VectorUDT.serialize") ) case v if v.startsWith("1.4") => @@ -182,7 +214,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]( "org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast"), 
ProblemFilters.exclude[MissingClassProblem]( - "org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorActor") + "org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint") ) ++ Seq( // SPARK-4655 - Making Stage an Abstract class broke binary compatility even though // the stage class is defined as private[spark] diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 9a33baa7c6ce..04e0d49b178c 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -42,11 +42,11 @@ object BuildCommons { "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl, - sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", - "kinesis-asl").map(ProjectRef(buildLocation, _)) + streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", + "streaming-kinesis-asl").map(ProjectRef(buildLocation, _)) - val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) = - Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly") + val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) = + Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly") .map(ProjectRef(buildLocation, _)) val tools = ProjectRef(buildLocation, "tools") @@ -212,6 +212,9 @@ object SparkBuild extends PomBuild { /* Enable Assembly for all assembly projects */ assemblyProjects.foreach(enable(Assembly.settings)) + /* Enable Assembly for streamingMqtt test */ + enable(inConfig(Test)(Assembly.settings))(streamingMqtt) + /* Package pyspark artifacts in a separate zip file for YARN. */ enable(PySparkAssembly.settings)(assembly) @@ -316,6 +319,8 @@ object SQL { lazy val settings = Seq( initialCommands in console := """ + |import org.apache.spark.SparkContext + |import org.apache.spark.sql.SQLContext |import org.apache.spark.sql.catalyst.analysis._ |import org.apache.spark.sql.catalyst.dsl._ |import org.apache.spark.sql.catalyst.errors._ @@ -325,9 +330,14 @@ object SQL { |import org.apache.spark.sql.catalyst.util._ |import org.apache.spark.sql.execution |import org.apache.spark.sql.functions._ - |import org.apache.spark.sql.test.TestSQLContext._ - |import org.apache.spark.sql.types._""".stripMargin, - cleanupCommands in console := "sparkContext.stop()" + |import org.apache.spark.sql.types._ + | + |val sc = new SparkContext("local[*]", "dev-shell") + |val sqlContext = new SQLContext(sc) + |import sqlContext.implicits._ + |import sqlContext._ + """.stripMargin, + cleanupCommands in console := "sc.stop()" ) } @@ -337,8 +347,6 @@ object Hive { javaOptions += "-XX:MaxPermSize=256m", // Specially disable assertions since some Hive tests fail them javaOptions in Test := (javaOptions in Test).value.filterNot(_ == "-ea"), - // Multiple queries rely on the TestHive singleton. See comments there for more details. - parallelExecution in Test := false, // Supporting all SerDes requires us to depend on deprecated APIs, so we turn off the warnings // only for this subproject. 
scalacOptions <<= scalacOptions map { currentOpts: Seq[String] => @@ -346,6 +354,7 @@ object Hive { }, initialCommands in console := """ + |import org.apache.spark.SparkContext |import org.apache.spark.sql.catalyst.analysis._ |import org.apache.spark.sql.catalyst.dsl._ |import org.apache.spark.sql.catalyst.errors._ @@ -382,13 +391,16 @@ object Assembly { .getOrElse(SbtPomKeys.effectivePom.value.getProperties.get("hadoop.version").asInstanceOf[String]) }, jarName in assembly <<= (version, moduleName, hadoopVersion) map { (v, mName, hv) => - if (mName.contains("streaming-flume-assembly") || mName.contains("streaming-kafka-assembly") || mName.contains("streaming-kinesis-asl-assembly")) { + if (mName.contains("streaming-flume-assembly") || mName.contains("streaming-kafka-assembly") || mName.contains("streaming-mqtt-assembly") || mName.contains("streaming-kinesis-asl-assembly")) { // This must match the same name used in maven (see external/kafka-assembly/pom.xml) s"${mName}-${v}.jar" } else { s"${mName}-${v}-hadoop${hv}.jar" } }, + jarName in (Test, assembly) <<= (version, moduleName, hadoopVersion) map { (v, mName, hv) => + s"${mName}-test-${v}.jar" + }, mergeStrategy in assembly := { case PathList("org", "datanucleus", xs @ _*) => MergeStrategy.discard case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard @@ -540,6 +552,7 @@ object TestSettings { javaOptions in Test += "-Dspark.test.home=" + sparkHome, javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dspark.port.maxRetries=100", + javaOptions in Test += "-Dspark.master.rest.enabled=false", javaOptions in Test += "-Dspark.ui.enabled=false", javaOptions in Test += "-Dspark.ui.showConsoleProgress=false", javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true", diff --git a/project/plugins.sbt b/project/plugins.sbt index 51820460ca1a..c06687d8f197 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,3 @@ -scalaVersion := "2.10.4" - resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" diff --git a/python/docs/index.rst b/python/docs/index.rst index f7eede9c3c82..306ffdb0e0f1 100644 --- a/python/docs/index.rst +++ b/python/docs/index.rst @@ -29,6 +29,14 @@ Core classes: A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. + :class:`pyspark.streaming.StreamingContext` + + Main entry point for Spark Streaming functionality. + + :class:`pyspark.streaming.DStream` + + A Discretized Stream (DStream), the basic abstraction in Spark Streaming. + :class:`pyspark.sql.SQLContext` Main entry point for DataFrame and SQL functionality. diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst index 26ece4c2c389..2d54ab118b94 100644 --- a/python/docs/pyspark.mllib.rst +++ b/python/docs/pyspark.mllib.rst @@ -46,6 +46,14 @@ pyspark.mllib.linalg module :undoc-members: :show-inheritance: +pyspark.mllib.linalg.distributed module +--------------------------------------- + +.. 
automodule:: pyspark.mllib.linalg.distributed + :members: + :undoc-members: + :show-inheritance: + pyspark.mllib.random module --------------------------- diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst index 50822c93faba..fc52a647543e 100644 --- a/python/docs/pyspark.streaming.rst +++ b/python/docs/pyspark.streaming.rst @@ -15,3 +15,24 @@ pyspark.streaming.kafka module :members: :undoc-members: :show-inheritance: + +pyspark.streaming.kinesis module +-------------------------------- +.. automodule:: pyspark.streaming.kinesis + :members: + :undoc-members: + :show-inheritance: + +pyspark.streaming.flume.module +------------------------------ +.. automodule:: pyspark.streaming.flume + :members: + :undoc-members: + :show-inheritance: + +pyspark.streaming.mqtt module +----------------------------- +.. automodule:: pyspark.streaming.mqtt + :members: + :undoc-members: + :show-inheritance: diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 3b647985801b..95b3abc74244 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -350,6 +350,11 @@ def save_global(self, obj, name=None, pack=struct.pack): if new_override: d['__new__'] = obj.__new__ + # workaround for namedtuple (hijacked by PySpark) + if getattr(obj, '_is_namedtuple_', False): + self.save_reduce(_load_namedtuple, (obj.__name__, obj._fields)) + return + self.save(_load_class) self.save_reduce(typ, (obj.__name__, obj.__bases__, {"__doc__": obj.__doc__}), obj=obj) d.pop('__doc__', None) @@ -382,7 +387,7 @@ def save_instancemethod(self, obj): self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) else: self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) + obj=obj) dispatch[types.MethodType] = save_instancemethod def save_inst(self, obj): @@ -744,6 +749,14 @@ def _load_class(cls, d): return cls +def _load_namedtuple(name, fields): + """ + Loads a class generated by namedtuple + """ + from collections import namedtuple + return namedtuple(name, fields) + + """Constructors for 3rd party libraries Note: These can never be renamed due to client compatibility issues""" diff --git a/python/pyspark/context.py b/python/pyspark/context.py index eb5b0bbbdac4..1b2a52ad6411 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -302,10 +302,10 @@ def applicationId(self): """ A unique identifier for the Spark application. Its format depends on the scheduler implementation. - (i.e. - in case of local spark app something like 'local-1433865536131' - in case of YARN something like 'application_1433865536131_34483' - ) + + * in case of local spark app something like 'local-1433865536131' + * in case of YARN something like 'application_1433865536131_34483' + >>> sc.applicationId # doctest: +ELLIPSIS u'local-...' 
""" diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 60be85e53e2a..cd4c55f79f18 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -54,7 +54,6 @@ def launch_gateway(): if os.environ.get("SPARK_TESTING"): submit_args = ' '.join([ "--conf spark.ui.enabled=false", - "--conf spark.buffer.pageSize=4mb", submit_args ]) command = [os.path.join(SPARK_HOME, script)] + shlex.split(submit_args) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b5814f76de00..83f808efc3bf 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): """ Logistic regression. + Currently, this class only supports binary classification. >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors @@ -69,17 +70,27 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.") + thresholds = Param(Params._dummy(), "thresholds", + "Thresholds in multi-class classification" + + " to adjust the probability of predicting each class." + + " Array must have length equal to the number of classes, with values >= 0." + + " The class with largest value p/t is predicted, where p is the original" + + " probability of that class and t is the class' threshold.") threshold = Param(Params._dummy(), "threshold", - "threshold in binary classification prediction, in range [0, 1].") + "Threshold in binary classification prediction, in range [0, 1]." + + " If threshold and thresholds are both set, they must match.") @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction"): + threshold=0.5, thresholds=None, + probabilityCol="probability", rawPredictionCol="rawPrediction"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction") + threshold=0.5, thresholds=None, \ + probabilityCol="probability", rawPredictionCol="rawPrediction") + If the threshold and thresholds Params are both set, they must be equivalent. """ super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( @@ -88,30 +99,45 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred # is an L2 penalty. For alpha = 1, it is an L1 penalty. self.elasticNetParam = \ Param(self, "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " + - "is an L2 penalty. For alpha = 1, it is an L1 penalty.") + "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") #: param for whether to fit an intercept term. self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.") - #: param for threshold in binary classification prediction, in range [0, 1]. 
+ #: param for threshold in binary classification, in range [0, 1]. self.threshold = Param(self, "threshold", - "threshold in binary classification prediction, in range [0, 1].") + "Threshold in binary classification prediction, in range [0, 1]." + + " If threshold and thresholds are both set, they must match.") + #: param for thresholds or cutoffs in binary or multiclass classification + self.thresholds = \ + Param(self, "thresholds", + "Thresholds in multi-class classification" + + " to adjust the probability of predicting each class." + + " Array must have length equal to the number of classes, with values >= 0." + + " The class with largest value p/t is predicted, where p is the original" + + " probability of that class and t is the class' threshold.") self._setDefault(maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1E-6, fitIntercept=True, threshold=0.5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) + self._checkThresholdConsistency() @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction"): + threshold=0.5, thresholds=None, + probabilityCol="probability", rawPredictionCol="rawPrediction"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction") + threshold=0.5, thresholds=None, \ + probabilityCol="probability", rawPredictionCol="rawPrediction") Sets params for logistic regression. + If the threshold and thresholds Params are both set, they must be equivalent. """ kwargs = self.setParams._input_kwargs - return self._set(**kwargs) + self._set(**kwargs) + self._checkThresholdConsistency() + return self def _create_model(self, java_model): return LogisticRegressionModel(java_model) @@ -145,15 +171,64 @@ def getFitIntercept(self): def setThreshold(self, value): """ Sets the value of :py:attr:`threshold`. + Clears value of :py:attr:`thresholds` if it has been set. """ self._paramMap[self.threshold] = value + if self.isSet(self.thresholds): + del self._paramMap[self.thresholds] return self def getThreshold(self): """ Gets the value of threshold or its default value. """ - return self.getOrDefault(self.threshold) + self._checkThresholdConsistency() + if self.isSet(self.thresholds): + ts = self.getOrDefault(self.thresholds) + if len(ts) != 2: + raise ValueError("Logistic Regression getThreshold only applies to" + + " binary classification, but thresholds has length != 2." + + " thresholds: " + ",".join(ts)) + return 1.0/(1.0 + ts[0]/ts[1]) + else: + return self.getOrDefault(self.threshold) + + def setThresholds(self, value): + """ + Sets the value of :py:attr:`thresholds`. + Clears value of :py:attr:`threshold` if it has been set. + """ + self._paramMap[self.thresholds] = value + if self.isSet(self.threshold): + del self._paramMap[self.threshold] + return self + + def getThresholds(self): + """ + If :py:attr:`thresholds` is set, return its value. + Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary + classification: (1-threshold, threshold). + If neither are set, throw an error. 
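Editor's note: the getThreshold/getThresholds docstrings above describe how a binary `threshold` and a two-element `thresholds` array encode the same decision rule. A minimal sketch of that round-trip arithmetic, in plain Python with illustrative values only (no Spark objects involved), assuming the conversion formulas shown in this patch:

    # Round-trip between `threshold` and `thresholds` as described above
    # (illustrative arithmetic only; values are made up for the example).
    t = 0.7
    thresholds = [1.0 - t, t]                       # equivalent pair: [0.3, 0.7]
    recovered = 1.0 / (1.0 + thresholds[0] / thresholds[1])
    assert abs(recovered - t) < 1e-5                # consistent, so the consistency check would pass
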
+ """ + self._checkThresholdConsistency() + if not self.isSet(self.thresholds) and self.isSet(self.threshold): + t = self.getOrDefault(self.threshold) + return [1.0-t, t] + else: + return self.getOrDefault(self.thresholds) + + def _checkThresholdConsistency(self): + if self.isSet(self.threshold) and self.isSet(self.thresholds): + ts = self.getParam(self.thresholds) + if len(ts) != 2: + raise ValueError("Logistic Regression getThreshold only applies to" + + " binary classification, but thresholds has length != 2." + + " thresholds: " + ",".join(ts)) + t = 1.0/(1.0 + ts[0]/ts[1]) + t2 = self.getParam(self.threshold) + if abs(t2 - t) >= 1E-5: + raise ValueError("Logistic Regression getThreshold found inconsistent values for" + + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) class LogisticRegressionModel(JavaModel): @@ -299,6 +374,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel): @inherit_doc class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, + HasRawPredictionCol, HasProbabilityCol, DecisionTreeParams, HasCheckpointInterval): """ `http://en.wikipedia.org/wiki/Random_forest Random Forest` @@ -306,6 +382,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred It supports both binary and multiclass labels, as well as both continuous and categorical features. + >>> import numpy >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer @@ -320,8 +397,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction + >>> result = model.transform(test0).head() + >>> result.prediction 0.0 + >>> numpy.argmax(result.probability) + 0 + >>> numpy.argmax(result.rawPrediction) + 0 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 @@ -342,11 +424,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ numTrees=20, featureSubsetStrategy="auto", seed=None) @@ -379,11 +463,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, impurity="gini", numTrees=20, featureSubsetStrategy="auto"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + probabilityCol="probability", 
rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \ impurity="gini", numTrees=20, featureSubsetStrategy="auto") @@ -597,6 +683,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H HasRawPredictionCol): """ Naive Bayes Classifiers. + It supports both Multinomial and Bernoulli NB. Multinomial NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`) + can handle finitely supported discrete data. For example, by converting documents into + TF-IDF vectors, it can be used for document classification. By making every vector a + binary (0/1) data, it can also be used as Bernoulli NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`). + The input feature values must be nonnegative. >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index b5e9b6549d9f..cb4c16e25a7a 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -19,7 +19,6 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * from pyspark.mllib.common import inherit_doc -from pyspark.mllib.linalg import _convert_to_vector __all__ = ['KMeans', 'KMeansModel'] @@ -35,15 +34,17 @@ def clusterCenters(self): @inherit_doc -class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): +class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed): """ - K-means Clustering + K-means clustering with support for multiple parallel runs and a k-means++ like initialization + mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, + they are executed together with joint passes over the data for efficiency. >>> from pyspark.mllib.linalg import Vectors >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), ... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] >>> df = sqlContext.createDataFrame(data, ["features"]) - >>> kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol("features") + >>> kmeans = KMeans(k=2, seed=1) >>> model = kmeans.fit(df) >>> centers = model.clusterCenters() >>> len(centers) @@ -58,10 +59,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): # a placeholder to make it appear in the generated doc k = Param(Params._dummy(), "k", "number of clusters to create") - epsilon = Param(Params._dummy(), "epsilon", - "distance threshold within which " + - "we've consider centers to have converged") - runs = Param(Params._dummy(), "runs", "number of runs of the algorithm to execute in parallel") initMode = Param(Params._dummy(), "initMode", "the initialization algorithm. 
This can be either \"random\" to " + "choose random points as initial cluster centers, or \"k-means||\" " + @@ -69,21 +66,21 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode") @keyword_only - def __init__(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initStep=5): + def __init__(self, featuresCol="features", predictionCol="prediction", k=2, + initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None): + """ + __init__(self, featuresCol="features", predictionCol="prediction", k=2, \ + initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None) + """ super(KMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) self.k = Param(self, "k", "number of clusters to create") - self.epsilon = Param(self, "epsilon", - "distance threshold within which " + - "we've consider centers to have converged") - self.runs = Param(self, "runs", "number of runs of the algorithm to execute in parallel") - self.seed = Param(self, "seed", "random seed") self.initMode = Param(self, "initMode", "the initialization algorithm. This can be either \"random\" to " + "choose random points as initial cluster centers, or \"k-means||\" " + "to use a parallel variant of k-means++") self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode") - self._setDefault(k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5) + self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -91,9 +88,11 @@ def _create_model(self, java_model): return KMeansModel(java_model) @keyword_only - def setParams(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5): + def setParams(self, featuresCol="features", predictionCol="prediction", k=2, + initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None): """ - setParams(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5): + setParams(self, featuresCol="features", predictionCol="prediction", k=2, \ + initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None) Sets params for KMeans. """ @@ -117,40 +116,6 @@ def getK(self): """ return self.getOrDefault(self.k) - def setEpsilon(self, value): - """ - Sets the value of :py:attr:`epsilon`. - - >>> algo = KMeans().setEpsilon(1e-5) - >>> abs(algo.getEpsilon() - 1e-5) < 1e-5 - True - """ - self._paramMap[self.epsilon] = value - return self - - def getEpsilon(self): - """ - Gets the value of `epsilon` - """ - return self.getOrDefault(self.epsilon) - - def setRuns(self, value): - """ - Sets the value of :py:attr:`runs`. - - >>> algo = KMeans().setRuns(10) - >>> algo.getRuns() - 10 - """ - self._paramMap[self.runs] = value - return self - - def getRuns(self): - """ - Gets the value of `runs` - """ - return self.getOrDefault(self.runs) - def setInitMode(self, value): """ Sets the value of :py:attr:`initMode`. 
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 06e809352225..cb3b07947e48 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -23,7 +23,8 @@ from pyspark.ml.util import keyword_only from pyspark.mllib.common import inherit_doc -__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator'] +__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator', + 'MulticlassClassificationEvaluator'] @inherit_doc @@ -45,7 +46,7 @@ def _evaluate(self, dataset): """ raise NotImplementedError() - def evaluate(self, dataset, params={}): + def evaluate(self, dataset, params=None): """ Evaluates the output with optional parameters. @@ -55,6 +56,8 @@ def evaluate(self, dataset, params={}): params :return: metric """ + if params is None: + params = dict() if isinstance(params, dict): if params: return self.copy(params)._evaluate(dataset) @@ -63,6 +66,14 @@ def evaluate(self, dataset, params={}): else: raise ValueError("Params must be a param map but got %s." % type(params)) + def isLargerBetter(self): + """ + Indicates whether the metric returned by :py:meth:`evaluate` should be maximized + (True, default) or minimized (False). + A given evaluator may support multiple metrics which may be maximized or minimized. + """ + return True + @inherit_doc class JavaEvaluator(Evaluator, JavaWrapper): @@ -82,6 +93,10 @@ def _evaluate(self, dataset): self._transfer_params_to_java() return self._java_obj.evaluate(dataset._jdf) + def isLargerBetter(self): + self._transfer_params_to_java() + return self._java_obj.isLargerBetter() + @inherit_doc class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol): @@ -160,11 +175,11 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): ... >>> evaluator = RegressionEvaluator(predictionCol="raw") >>> evaluator.evaluate(dataset) - -2.842... + 2.842... >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"}) 0.993... >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"}) - -2.649... + 2.649... """ # Because we will maximize evaluation value (ref: `CrossValidator`), # when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`), diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 015e7a9d4900..04b2b2ccc9e5 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -15,16 +15,22 @@ # limitations under the License. 
# +import sys +if sys.version > '3': + basestring = str + from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc +from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', - 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', - 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', - 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel'] +__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', + 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', + 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', + 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', + 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] @inherit_doc @@ -160,6 +166,63 @@ def getSplits(self): return self.getOrDefault(self.splits) +@inherit_doc +class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): + """ + Outputs the Hadamard product (i.e., the element-wise product) of each input vector + with a provided "weight" vector. In other words, it scales each column of the dataset + by a scalar multiplier. + + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"]) + >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]), + ... inputCol="values", outputCol="eprod") + >>> ep.transform(df).head().eprod + DenseVector([2.0, 2.0, 9.0]) + >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod + DenseVector([4.0, 3.0, 15.0]) + """ + + # a placeholder to make it appear in the generated doc + scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + + "it must be MLlib Vector type.") + + @keyword_only + def __init__(self, scalingVec=None, inputCol=None, outputCol=None): + """ + __init__(self, scalingVec=None, inputCol=None, outputCol=None) + """ + super(ElementwiseProduct, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", + self.uid) + self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " + + "it must be MLlib Vector type.") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, scalingVec=None, inputCol=None, outputCol=None): + """ + setParams(self, scalingVec=None, inputCol=None, outputCol=None) + Sets params for this ElementwiseProduct. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setScalingVec(self, value): + """ + Sets the value of :py:attr:`scalingVec`. + """ + self._paramMap[self.scalingVec] = value + return self + + def getScalingVec(self): + """ + Gets the value of scalingVec or its default value. 
+ """ + return self.getOrDefault(self.scalingVec) + + @inherit_doc class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): """ @@ -954,6 +1017,23 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has >>> sent = ("a b " * 100 + "a c " * 10).split(" ") >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"]) >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc) + >>> model.getVectors().show() + +----+--------------------+ + |word| vector| + +----+--------------------+ + | a|[-0.3511952459812...| + | b|[0.29077222943305...| + | c|[0.02315592765808...| + +----+--------------------+ + ... + >>> model.findSynonyms("a", 2).show() + +----+-------------------+ + |word| similarity| + +----+-------------------+ + | b|0.29255685145799626| + | c|-0.5414068302988307| + +----+-------------------+ + ... >>> model.transform(doc).head().model DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276]) """ @@ -1047,6 +1127,24 @@ class Word2VecModel(JavaModel): Model fitted by Word2Vec. """ + def getVectors(self): + """ + Returns the vector representation of the words as a dataframe + with two fields, word and vector. + """ + return self._call_java("getVectors") + + def findSynonyms(self, word, num): + """ + Find "num" number of words closest in similarity to "word". + word can be a string or vector representation. + Returns a dataframe with two fields word and similarity (which + gives the cosine similarity). + """ + if not isinstance(word, basestring): + word = _convert_to_vector(word) + return self._call_java("findSynonyms", word, num) + @inherit_doc class PCA(JavaEstimator, HasInputCol, HasOutputCol): @@ -1110,6 +1208,89 @@ class PCAModel(JavaModel): """ +@inherit_doc +class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): + """ + .. note:: Experimental + + Implements the transforms required for fitting a dataset against an + R model formula. Currently we support a limited subset of the R + operators, including '~', '+', '-', and '.'. Also see the R formula + docs: + http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html + + >>> df = sqlContext.createDataFrame([ + ... (1.0, 1.0, "a"), + ... (0.0, 2.0, "b"), + ... (0.0, 0.0, "a") + ... ], ["y", "x", "s"]) + >>> rf = RFormula(formula="y ~ x + s") + >>> rf.fit(df).transform(df).show() + +---+---+---+---------+-----+ + | y| x| s| features|label| + +---+---+---+---------+-----+ + |1.0|1.0| a|[1.0,1.0]| 1.0| + |0.0|2.0| b|[2.0,0.0]| 0.0| + |0.0|0.0| a|[0.0,1.0]| 0.0| + +---+---+---+---------+-----+ + ... + >>> rf.fit(df, {rf.formula: "y ~ . - s"}).transform(df).show() + +---+---+---+--------+-----+ + | y| x| s|features|label| + +---+---+---+--------+-----+ + |1.0|1.0| a| [1.0]| 1.0| + |0.0|2.0| b| [2.0]| 0.0| + |0.0|0.0| a| [0.0]| 0.0| + +---+---+---+--------+-----+ + ... 
+ """ + + # a placeholder to make it appear in the generated doc + formula = Param(Params._dummy(), "formula", "R model formula") + + @keyword_only + def __init__(self, formula=None, featuresCol="features", labelCol="label"): + """ + __init__(self, formula=None, featuresCol="features", labelCol="label") + """ + super(RFormula, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) + self.formula = Param(self, "formula", "R model formula") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, formula=None, featuresCol="features", labelCol="label"): + """ + setParams(self, formula=None, featuresCol="features", labelCol="label") + Sets params for RFormula. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setFormula(self, value): + """ + Sets the value of :py:attr:`formula`. + """ + self._paramMap[self.formula] = value + return self + + def getFormula(self): + """ + Gets the value of :py:attr:`formula`. + """ + return self.getOrDefault(self.formula) + + def _create_model(self, java_model): + return RFormulaModel(java_model) + + +class RFormulaModel(JavaModel): + """ + Model fitted by :py:class:`RFormula`. + """ + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 7845536161e0..eeeac49b2198 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -60,14 +60,16 @@ class Params(Identifiable): __metaclass__ = ABCMeta - #: internal param map for user-supplied values param map - _paramMap = {} + def __init__(self): + super(Params, self).__init__() + #: internal param map for user-supplied values param map + self._paramMap = {} - #: internal param map for default values - _defaultParamMap = {} + #: internal param map for default values + self._defaultParamMap = {} - #: value returned by :py:func:`params` - _params = None + #: value returned by :py:func:`params` + self._params = None @property def params(self): @@ -155,7 +157,7 @@ def getOrDefault(self, param): else: return self._defaultParamMap[param] - def extractParamMap(self, extra={}): + def extractParamMap(self, extra=None): """ Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into @@ -165,12 +167,14 @@ def extractParamMap(self, extra={}): :param extra: extra param values :return: merged param map """ + if extra is None: + extra = dict() paramMap = self._defaultParamMap.copy() paramMap.update(self._paramMap) paramMap.update(extra) return paramMap - def copy(self, extra={}): + def copy(self, extra=None): """ Creates a copy of this instance with the same uid and some extra params. The default implementation creates a @@ -181,6 +185,8 @@ def copy(self, extra={}): :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ + if extra is None: + extra = dict() that = copy.copy(self) that._paramMap = self.extractParamMap(extra) return that @@ -233,7 +239,7 @@ def _setDefault(self, **kwargs): self._defaultParamMap[getattr(self, param)] = value return self - def _copyValues(self, to, extra={}): + def _copyValues(self, to, extra=None): """ Copies param values from this instance to another instance for params shared by them. 
@@ -241,6 +247,8 @@ def _copyValues(self, to, extra={}): :param extra: extra params to be copied :return: the target instance with param values copied """ + if extra is None: + extra = dict() paramMap = self.extractParamMap(extra) for p in self.params: if p in paramMap and to.hasParam(p.name): diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 9889f56cac9e..13cf2b0f7bbd 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -141,7 +141,7 @@ class Pipeline(Estimator): @keyword_only def __init__(self, stages=None): """ - __init__(self, stages=[]) + __init__(self, stages=None) """ if stages is None: stages = [] @@ -170,7 +170,7 @@ def getStages(self): @keyword_only def setParams(self, stages=None): """ - setParams(self, stages=[]) + setParams(self, stages=None) Sets params for Pipeline. """ if stages is None: diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index c151d21fd661..60e4237293ad 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -32,11 +32,14 @@ from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase from pyspark.sql import DataFrame, SQLContext +from pyspark.sql.functions import rand +from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.param import Param, Params from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed from pyspark.ml.util import keyword_only from pyspark.ml import Estimator, Model, Pipeline, Transformer from pyspark.ml.feature import * +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel from pyspark.mllib.linalg import DenseVector @@ -264,5 +267,89 @@ def test_ngram(self): self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"]) +class HasInducedError(Params): + + def __init__(self): + super(HasInducedError, self).__init__() + self.inducedError = Param(self, "inducedError", + "Uniformly-distributed error added to feature") + + def getInducedError(self): + return self.getOrDefault(self.inducedError) + + +class InducedErrorModel(Model, HasInducedError): + + def __init__(self): + super(InducedErrorModel, self).__init__() + + def _transform(self, dataset): + return dataset.withColumn("prediction", + dataset.feature + (rand(0) * self.getInducedError())) + + +class InducedErrorEstimator(Estimator, HasInducedError): + + def __init__(self, inducedError=1.0): + super(InducedErrorEstimator, self).__init__() + self._set(inducedError=inducedError) + + def _fit(self, dataset): + model = InducedErrorModel() + self._copyValues(model) + return model + + +class CrossValidatorTests(PySparkTestCase): + + def test_fit_minimize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="rmse") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvModel = cv.fit(dataset) + bestModel = cvModel.bestModel + bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") + + def test_fit_maximize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + 
(50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="r2") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvModel = cv.fit(dataset) + bestModel = cvModel.bestModel + bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") + + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 0bf988fd72f1..cae778869e9c 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -223,11 +223,17 @@ def _fit(self, dataset): # TODO: duplicate evaluator to take extra params from input metric = eva.evaluate(model.transform(validation, epm[j])) metrics[j] += metric - bestIndex = np.argmax(metrics) + + if eva.isLargerBetter(): + bestIndex = np.argmax(metrics) + else: + bestIndex = np.argmin(metrics) bestModel = est.fit(dataset, epm[bestIndex]) return CrossValidatorModel(bestModel) - def copy(self, extra={}): + def copy(self, extra=None): + if extra is None: + extra = dict() newCV = Params.copy(self, extra) if self.isSet(self.estimator): newCV.setEstimator(self.getEstimator().copy(extra)) @@ -250,7 +256,7 @@ def __init__(self, bestModel): def _transform(self, dataset): return self.bestModel.transform(dataset) - def copy(self, extra={}): + def copy(self, extra=None): """ Creates a copy of this instance with a randomly generated uid and some extra params. This copies the underlying bestModel, @@ -259,6 +265,8 @@ def copy(self, extra={}): :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ + if extra is None: + extra = dict() return CrossValidatorModel(self.bestModel.copy(extra)) diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 855e85f57155..a439a488de5c 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -73,6 +73,8 @@ def _py2java(sc, obj): """ Convert Python object into Java """ if isinstance(obj, RDD): obj = _to_java_object_rdd(obj) + elif isinstance(obj, DataFrame): + obj = obj._jdf elif isinstance(obj, SparkContext): obj = obj._jsc elif isinstance(obj, list): diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py new file mode 100644 index 000000000000..aec407de90aa --- /dev/null +++ b/python/pyspark/mllib/linalg/distributed.py @@ -0,0 +1,853 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Package for distributed linear algebra. +""" + +import sys + +if sys.version >= '3': + long = int + +from py4j.java_gateway import JavaObject + +from pyspark import RDD +from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper +from pyspark.mllib.linalg import _convert_to_vector, Matrix + + +__all__ = ['DistributedMatrix', 'RowMatrix', 'IndexedRow', + 'IndexedRowMatrix', 'MatrixEntry', 'CoordinateMatrix', + 'BlockMatrix'] + + +class DistributedMatrix(object): + """ + .. note:: Experimental + + Represents a distributively stored matrix backed by one or + more RDDs. + + """ + def numRows(self): + """Get or compute the number of rows.""" + raise NotImplementedError + + def numCols(self): + """Get or compute the number of cols.""" + raise NotImplementedError + + +class RowMatrix(DistributedMatrix): + """ + .. note:: Experimental + + Represents a row-oriented distributed Matrix with no meaningful + row indices. + + :param rows: An RDD of vectors. + :param numRows: Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the number of + records in the `rows` RDD. + :param numCols: Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. + """ + def __init__(self, rows, numRows=0, numCols=0): + """ + Note: This docstring is not shown publicly. + + Create a wrapper over a Java RowMatrix. + + Publicly, we require that `rows` be an RDD. However, for + internal usage, `rows` can also be a Java RowMatrix + object, in which case we can wrap it directly. This + assists in clean matrix conversions. + + >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) + >>> mat = RowMatrix(rows) + + >>> mat_diff = RowMatrix(rows) + >>> (mat_diff._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + False + + >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model) + >>> (mat_same._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + True + """ + if isinstance(rows, RDD): + rows = rows.map(_convert_to_vector) + java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) + elif (isinstance(rows, JavaObject) + and rows.getClass().getSimpleName() == "RowMatrix"): + java_matrix = rows + else: + raise TypeError("rows should be an RDD of vectors, got %s" % type(rows)) + + self._java_matrix_wrapper = JavaModelWrapper(java_matrix) + + @property + def rows(self): + """ + Rows of the RowMatrix stored as an RDD of vectors. + + >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]])) + >>> rows = mat.rows + >>> rows.first() + DenseVector([1.0, 2.0, 3.0]) + """ + return self._java_matrix_wrapper.call("rows") + + def numRows(self): + """ + Get or compute the number of rows. + + >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], + ... [7, 8, 9], [10, 11, 12]]) + + >>> mat = RowMatrix(rows) + >>> print(mat.numRows()) + 4 + + >>> mat = RowMatrix(rows, 7, 6) + >>> print(mat.numRows()) + 7 + """ + return self._java_matrix_wrapper.call("numRows") + + def numCols(self): + """ + Get or compute the number of cols. + + >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], + ... [7, 8, 9], [10, 11, 12]]) + + >>> mat = RowMatrix(rows) + >>> print(mat.numCols()) + 3 + + >>> mat = RowMatrix(rows, 7, 6) + >>> print(mat.numCols()) + 6 + """ + return self._java_matrix_wrapper.call("numCols") + + +class IndexedRow(object): + """ + .. 
note:: Experimental + + Represents a row of an IndexedRowMatrix. + + Just a wrapper over a (long, vector) tuple. + + :param index: The index for the given row. + :param vector: The row in the matrix at the given index. + """ + def __init__(self, index, vector): + self.index = long(index) + self.vector = _convert_to_vector(vector) + + def __repr__(self): + return "IndexedRow(%s, %s)" % (self.index, self.vector) + + +def _convert_to_indexed_row(row): + if isinstance(row, IndexedRow): + return row + elif isinstance(row, tuple) and len(row) == 2: + return IndexedRow(*row) + else: + raise TypeError("Cannot convert type %s into IndexedRow" % type(row)) + + +class IndexedRowMatrix(DistributedMatrix): + """ + .. note:: Experimental + + Represents a row-oriented distributed Matrix with indexed rows. + + :param rows: An RDD of IndexedRows or (long, vector) tuples. + :param numRows: Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + :param numCols: Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. + """ + def __init__(self, rows, numRows=0, numCols=0): + """ + Note: This docstring is not shown publicly. + + Create a wrapper over a Java IndexedRowMatrix. + + Publicly, we require that `rows` be an RDD. However, for + internal usage, `rows` can also be a Java IndexedRowMatrix + object, in which case we can wrap it directly. This + assists in clean matrix conversions. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(1, [4, 5, 6])]) + >>> mat = IndexedRowMatrix(rows) + + >>> mat_diff = IndexedRowMatrix(rows) + >>> (mat_diff._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + False + + >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model) + >>> (mat_same._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + True + """ + if isinstance(rows, RDD): + rows = rows.map(_convert_to_indexed_row) + # We use DataFrames for serialization of IndexedRows from + # Python, so first convert the RDD to a DataFrame on this + # side. This will convert each IndexedRow to a Row + # containing the 'index' and 'vector' values, which can + # both be easily serialized. We will convert back to + # IndexedRows on the Scala side. + java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(), + long(numRows), int(numCols)) + elif (isinstance(rows, JavaObject) + and rows.getClass().getSimpleName() == "IndexedRowMatrix"): + java_matrix = rows + else: + raise TypeError("rows should be an RDD of IndexedRows or (long, vector) tuples, " + "got %s" % type(rows)) + + self._java_matrix_wrapper = JavaModelWrapper(java_matrix) + + @property + def rows(self): + """ + Rows of the IndexedRowMatrix stored as an RDD of IndexedRows. + + >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(1, [4, 5, 6])])) + >>> rows = mat.rows + >>> rows.first() + IndexedRow(0, [1.0,2.0,3.0]) + """ + # We use DataFrames for serialization of IndexedRows from + # Java, so we first convert the RDD of rows to a DataFrame + # on the Scala/Java side. Then we map each Row in the + # DataFrame back to an IndexedRow on this side. 
+ rows_df = callMLlibFunc("getIndexedRows", self._java_matrix_wrapper._java_model) + rows = rows_df.map(lambda row: IndexedRow(row[0], row[1])) + return rows + + def numRows(self): + """ + Get or compute the number of rows. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(1, [4, 5, 6]), + ... IndexedRow(2, [7, 8, 9]), + ... IndexedRow(3, [10, 11, 12])]) + + >>> mat = IndexedRowMatrix(rows) + >>> print(mat.numRows()) + 4 + + >>> mat = IndexedRowMatrix(rows, 7, 6) + >>> print(mat.numRows()) + 7 + """ + return self._java_matrix_wrapper.call("numRows") + + def numCols(self): + """ + Get or compute the number of cols. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(1, [4, 5, 6]), + ... IndexedRow(2, [7, 8, 9]), + ... IndexedRow(3, [10, 11, 12])]) + + >>> mat = IndexedRowMatrix(rows) + >>> print(mat.numCols()) + 3 + + >>> mat = IndexedRowMatrix(rows, 7, 6) + >>> print(mat.numCols()) + 6 + """ + return self._java_matrix_wrapper.call("numCols") + + def toRowMatrix(self): + """ + Convert this matrix to a RowMatrix. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(6, [4, 5, 6])]) + >>> mat = IndexedRowMatrix(rows).toRowMatrix() + >>> mat.rows.collect() + [DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0])] + """ + java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix") + return RowMatrix(java_row_matrix) + + def toCoordinateMatrix(self): + """ + Convert this matrix to a CoordinateMatrix. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 0]), + ... IndexedRow(6, [0, 5])]) + >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix() + >>> mat.entries.take(3) + [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 0.0), MatrixEntry(6, 0, 0.0)] + """ + java_coordinate_matrix = self._java_matrix_wrapper.call("toCoordinateMatrix") + return CoordinateMatrix(java_coordinate_matrix) + + def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): + """ + Convert this matrix to a BlockMatrix. + + :param rowsPerBlock: Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + :param colsPerBlock: Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), + ... IndexedRow(6, [4, 5, 6])]) + >>> mat = IndexedRowMatrix(rows).toBlockMatrix() + + >>> # This IndexedRowMatrix will have 7 effective rows, due to + >>> # the highest row index being 6, and the ensuing + >>> # BlockMatrix will have 7 rows as well. + >>> print(mat.numRows()) + 7 + + >>> print(mat.numCols()) + 3 + """ + java_block_matrix = self._java_matrix_wrapper.call("toBlockMatrix", + rowsPerBlock, + colsPerBlock) + return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) + + +class MatrixEntry(object): + """ + .. note:: Experimental + + Represents an entry of a CoordinateMatrix. + + Just a wrapper over a (long, long, float) tuple. + + :param i: The row index of the matrix. + :param j: The column index of the matrix. + :param value: The (i, j)th entry of the matrix, as a float. 
+ """ + def __init__(self, i, j, value): + self.i = long(i) + self.j = long(j) + self.value = float(value) + + def __repr__(self): + return "MatrixEntry(%s, %s, %s)" % (self.i, self.j, self.value) + + +def _convert_to_matrix_entry(entry): + if isinstance(entry, MatrixEntry): + return entry + elif isinstance(entry, tuple) and len(entry) == 3: + return MatrixEntry(*entry) + else: + raise TypeError("Cannot convert type %s into MatrixEntry" % type(entry)) + + +class CoordinateMatrix(DistributedMatrix): + """ + .. note:: Experimental + + Represents a matrix in coordinate format. + + :param entries: An RDD of MatrixEntry inputs or + (long, long, float) tuples. + :param numRows: Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + :param numCols: Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the max row + index plus one. + """ + def __init__(self, entries, numRows=0, numCols=0): + """ + Note: This docstring is not shown publicly. + + Create a wrapper over a Java CoordinateMatrix. + + Publicly, we require that `rows` be an RDD. However, for + internal usage, `rows` can also be a Java CoordinateMatrix + object, in which case we can wrap it directly. This + assists in clean matrix conversions. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(6, 4, 2.1)]) + >>> mat = CoordinateMatrix(entries) + + >>> mat_diff = CoordinateMatrix(entries) + >>> (mat_diff._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + False + + >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model) + >>> (mat_same._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + True + """ + if isinstance(entries, RDD): + entries = entries.map(_convert_to_matrix_entry) + # We use DataFrames for serialization of MatrixEntry entries + # from Python, so first convert the RDD to a DataFrame on + # this side. This will convert each MatrixEntry to a Row + # containing the 'i', 'j', and 'value' values, which can + # each be easily serialized. We will convert back to + # MatrixEntry inputs on the Scala side. + java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(), + long(numRows), long(numCols)) + elif (isinstance(entries, JavaObject) + and entries.getClass().getSimpleName() == "CoordinateMatrix"): + java_matrix = entries + else: + raise TypeError("entries should be an RDD of MatrixEntry entries or " + "(long, long, float) tuples, got %s" % type(entries)) + + self._java_matrix_wrapper = JavaModelWrapper(java_matrix) + + @property + def entries(self): + """ + Entries of the CoordinateMatrix stored as an RDD of + MatrixEntries. + + >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(6, 4, 2.1)])) + >>> entries = mat.entries + >>> entries.first() + MatrixEntry(0, 0, 1.2) + """ + # We use DataFrames for serialization of MatrixEntry entries + # from Java, so we first convert the RDD of entries to a + # DataFrame on the Scala/Java side. Then we map each Row in + # the DataFrame back to a MatrixEntry on this side. + entries_df = callMLlibFunc("getMatrixEntries", self._java_matrix_wrapper._java_model) + entries = entries_df.map(lambda row: MatrixEntry(row[0], row[1], row[2])) + return entries + + def numRows(self): + """ + Get or compute the number of rows. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... 
MatrixEntry(1, 0, 2), + ... MatrixEntry(2, 1, 3.7)]) + + >>> mat = CoordinateMatrix(entries) + >>> print(mat.numRows()) + 3 + + >>> mat = CoordinateMatrix(entries, 7, 6) + >>> print(mat.numRows()) + 7 + """ + return self._java_matrix_wrapper.call("numRows") + + def numCols(self): + """ + Get or compute the number of cols. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(1, 0, 2), + ... MatrixEntry(2, 1, 3.7)]) + + >>> mat = CoordinateMatrix(entries) + >>> print(mat.numCols()) + 2 + + >>> mat = CoordinateMatrix(entries, 7, 6) + >>> print(mat.numCols()) + 6 + """ + return self._java_matrix_wrapper.call("numCols") + + def toRowMatrix(self): + """ + Convert this matrix to a RowMatrix. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(6, 4, 2.1)]) + >>> mat = CoordinateMatrix(entries).toRowMatrix() + + >>> # This CoordinateMatrix will have 7 effective rows, due to + >>> # the highest row index being 6, but the ensuing RowMatrix + >>> # will only have 2 rows since there are only entries on 2 + >>> # unique rows. + >>> print(mat.numRows()) + 2 + + >>> # This CoordinateMatrix will have 5 columns, due to the + >>> # highest column index being 4, and the ensuing RowMatrix + >>> # will have 5 columns as well. + >>> print(mat.numCols()) + 5 + """ + java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix") + return RowMatrix(java_row_matrix) + + def toIndexedRowMatrix(self): + """ + Convert this matrix to an IndexedRowMatrix. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(6, 4, 2.1)]) + >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix() + + >>> # This CoordinateMatrix will have 7 effective rows, due to + >>> # the highest row index being 6, and the ensuing + >>> # IndexedRowMatrix will have 7 rows as well. + >>> print(mat.numRows()) + 7 + + >>> # This CoordinateMatrix will have 5 columns, due to the + >>> # highest column index being 4, and the ensuing + >>> # IndexedRowMatrix will have 5 columns as well. + >>> print(mat.numCols()) + 5 + """ + java_indexed_row_matrix = self._java_matrix_wrapper.call("toIndexedRowMatrix") + return IndexedRowMatrix(java_indexed_row_matrix) + + def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): + """ + Convert this matrix to a BlockMatrix. + + :param rowsPerBlock: Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + :param colsPerBlock: Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), + ... MatrixEntry(6, 4, 2.1)]) + >>> mat = CoordinateMatrix(entries).toBlockMatrix() + + >>> # This CoordinateMatrix will have 7 effective rows, due to + >>> # the highest row index being 6, and the ensuing + >>> # BlockMatrix will have 7 rows as well. + >>> print(mat.numRows()) + 7 + + >>> # This CoordinateMatrix will have 5 columns, due to the + >>> # highest column index being 4, and the ensuing + >>> # BlockMatrix will have 5 columns as well. 
+ >>> print(mat.numCols()) + 5 + """ + java_block_matrix = self._java_matrix_wrapper.call("toBlockMatrix", + rowsPerBlock, + colsPerBlock) + return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) + + +def _convert_to_matrix_block_tuple(block): + if (isinstance(block, tuple) and len(block) == 2 + and isinstance(block[0], tuple) and len(block[0]) == 2 + and isinstance(block[1], Matrix)): + blockRowIndex = int(block[0][0]) + blockColIndex = int(block[0][1]) + subMatrix = block[1] + return ((blockRowIndex, blockColIndex), subMatrix) + else: + raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block)) + + +class BlockMatrix(DistributedMatrix): + """ + .. note:: Experimental + + Represents a distributed matrix in blocks of local matrices. + + :param blocks: An RDD of sub-matrix blocks + ((blockRowIndex, blockColIndex), sub-matrix) that + form this distributed matrix. If multiple blocks + with the same index exist, the results for + operations like add and multiply will be + unpredictable. + :param rowsPerBlock: Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + :param colsPerBlock: Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + :param numRows: Number of rows of this matrix. If the supplied + value is less than or equal to zero, the number + of rows will be calculated when `numRows` is + invoked. + :param numCols: Number of columns of this matrix. If the supplied + value is less than or equal to zero, the number + of columns will be calculated when `numCols` is + invoked. + """ + def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): + """ + Note: This docstring is not shown publicly. + + Create a wrapper over a Java BlockMatrix. + + Publicly, we require that `blocks` be an RDD. However, for + internal usage, `blocks` can also be a Java BlockMatrix + object, in which case we can wrap it directly. This + assists in clean matrix conversions. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2) + + >>> mat_diff = BlockMatrix(blocks, 3, 2) + >>> (mat_diff._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + False + + >>> mat_same = BlockMatrix(mat._java_matrix_wrapper._java_model, 3, 2) + >>> (mat_same._java_matrix_wrapper._java_model == + ... mat._java_matrix_wrapper._java_model) + True + """ + if isinstance(blocks, RDD): + blocks = blocks.map(_convert_to_matrix_block_tuple) + # We use DataFrames for serialization of sub-matrix blocks + # from Python, so first convert the RDD to a DataFrame on + # this side. This will convert each sub-matrix block + # tuple to a Row containing the 'blockRowIndex', + # 'blockColIndex', and 'subMatrix' values, which can + # each be easily serialized. We will convert back to + # ((blockRowIndex, blockColIndex), sub-matrix) tuples on + # the Scala side. 
+ java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(), + int(rowsPerBlock), int(colsPerBlock), + long(numRows), long(numCols)) + elif (isinstance(blocks, JavaObject) + and blocks.getClass().getSimpleName() == "BlockMatrix"): + java_matrix = blocks + else: + raise TypeError("blocks should be an RDD of sub-matrix blocks as " + "((int, int), matrix) tuples, got %s" % type(blocks)) + + self._java_matrix_wrapper = JavaModelWrapper(java_matrix) + + @property + def blocks(self): + """ + The RDD of sub-matrix blocks + ((blockRowIndex, blockColIndex), sub-matrix) that form this + distributed matrix. + + >>> mat = BlockMatrix( + ... sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2) + >>> blocks = mat.blocks + >>> blocks.first() + ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0)) + + """ + # We use DataFrames for serialization of sub-matrix blocks + # from Java, so we first convert the RDD of blocks to a + # DataFrame on the Scala/Java side. Then we map each Row in + # the DataFrame back to a sub-matrix block on this side. + blocks_df = callMLlibFunc("getMatrixBlocks", self._java_matrix_wrapper._java_model) + blocks = blocks_df.map(lambda row: ((row[0][0], row[0][1]), row[1])) + return blocks + + @property + def rowsPerBlock(self): + """ + Number of rows that make up each block. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2) + >>> mat.rowsPerBlock + 3 + """ + return self._java_matrix_wrapper.call("rowsPerBlock") + + @property + def colsPerBlock(self): + """ + Number of columns that make up each block. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2) + >>> mat.colsPerBlock + 2 + """ + return self._java_matrix_wrapper.call("colsPerBlock") + + @property + def numRowBlocks(self): + """ + Number of rows of blocks in the BlockMatrix. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2) + >>> mat.numRowBlocks + 2 + """ + return self._java_matrix_wrapper.call("numRowBlocks") + + @property + def numColBlocks(self): + """ + Number of columns of blocks in the BlockMatrix. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2) + >>> mat.numColBlocks + 1 + """ + return self._java_matrix_wrapper.call("numColBlocks") + + def numRows(self): + """ + Get or compute the number of rows. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + + >>> mat = BlockMatrix(blocks, 3, 2) + >>> print(mat.numRows()) + 6 + + >>> mat = BlockMatrix(blocks, 3, 2, 7, 6) + >>> print(mat.numRows()) + 7 + """ + return self._java_matrix_wrapper.call("numRows") + + def numCols(self): + """ + Get or compute the number of cols. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... 
((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + + >>> mat = BlockMatrix(blocks, 3, 2) + >>> print(mat.numCols()) + 2 + + >>> mat = BlockMatrix(blocks, 3, 2, 7, 6) + >>> print(mat.numCols()) + 6 + """ + return self._java_matrix_wrapper.call("numCols") + + def toLocalMatrix(self): + """ + Collect the distributed matrix on the driver as a DenseMatrix. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix() + + >>> # This BlockMatrix will have 6 effective rows, due to + >>> # having two sub-matrix blocks stacked, each with 3 rows. + >>> # The ensuing DenseMatrix will also have 6 rows. + >>> print(mat.numRows) + 6 + + >>> # This BlockMatrix will have 2 effective columns, due to + >>> # having two sub-matrix blocks stacked, each with 2 + >>> # columns. The ensuing DenseMatrix will also have 2 columns. + >>> print(mat.numCols) + 2 + """ + return self._java_matrix_wrapper.call("toLocalMatrix") + + def toIndexedRowMatrix(self): + """ + Convert this matrix to an IndexedRowMatrix. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), + ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) + >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix() + + >>> # This BlockMatrix will have 6 effective rows, due to + >>> # having two sub-matrix blocks stacked, each with 3 rows. + >>> # The ensuing IndexedRowMatrix will also have 6 rows. + >>> print(mat.numRows()) + 6 + + >>> # This BlockMatrix will have 2 effective columns, due to + >>> # having two sub-matrix blocks stacked, each with 2 columns. + >>> # The ensuing IndexedRowMatrix will also have 2 columns. + >>> print(mat.numCols()) + 2 + """ + java_indexed_row_matrix = self._java_matrix_wrapper.call("toIndexedRowMatrix") + return IndexedRowMatrix(java_indexed_row_matrix) + + def toCoordinateMatrix(self): + """ + Convert this matrix to a CoordinateMatrix. + + >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])), + ... ((1, 0), Matrices.dense(1, 2, [7, 8]))]) + >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix() + >>> mat.entries.take(3) + [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 0, 7.0)] + """ + java_coordinate_matrix = self._java_matrix_wrapper.call("toCoordinateMatrix") + return CoordinateMatrix(java_coordinate_matrix) + + +def _test(): + import doctest + from pyspark import SparkContext + from pyspark.sql import SQLContext + from pyspark.mllib.linalg import Matrices + import pyspark.mllib.linalg.distributed + globs = pyspark.mllib.linalg.distributed.__dict__.copy() + globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) + globs['sqlContext'] = SQLContext(globs['sc']) + globs['Matrices'] = Matrices + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 5b7afc15ddfb..41946e3674fb 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -207,8 +207,10 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, Train a linear regression model using Stochastic Gradient Descent (SGD). This solves the least squares regression formulation - f(weights) = 1/n ||A weights-y||^2^ - (which is the mean squared error). 
+ + f(weights) = 1/(2n) ||A weights - y||^2, + + which is the mean squared error. Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. @@ -334,7 +336,9 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, Stochastic Gradient Descent. This solves the l1-regularized least squares regression formulation - f(weights) = 1/2n ||A weights-y||^2^ + regParam ||weights||_1 + + f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1. + Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. @@ -451,7 +455,9 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, Stochastic Gradient Descent. This solves the l2-regularized least squares regression formulation - f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^ + + f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2. + Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 3f5a02af12e3..5097c5e8ba4c 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -32,6 +32,9 @@ from py4j.protocol import Py4JJavaError +if sys.version > '3': + basestring = str + if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest @@ -86,9 +89,42 @@ def tearDown(self): self.ssc.stop(False) @staticmethod - def _ssc_wait(start_time, end_time, sleep_time): - while time() - start_time < end_time: + def _eventually(condition, timeout=30.0, catch_assertions=False): + """ + Wait a given amount of time for a condition to pass, else fail with an error. + This is a helper utility for streaming ML tests. + :param condition: Function that checks for termination conditions. + condition() can return: + - True: Conditions met. Return without error. + - other value: Conditions not met yet. Continue. Upon timeout, + include last such value in error message. + Note that this method may be called at any time during + streaming execution (e.g., even before any results + have been created). + :param timeout: Number of seconds to wait. Default 30 seconds. + :param catch_assertions: If False (default), do not catch AssertionErrors. + If True, catch AssertionErrors; continue, but save + error to throw upon timeout. 
+ """ + start_time = time() + lastValue = None + while time() - start_time < timeout: + if catch_assertions: + try: + lastValue = condition() + except AssertionError as e: + lastValue = e + else: + lastValue = condition() + if lastValue is True: + return sleep(0.01) + if isinstance(lastValue, AssertionError): + raise lastValue + else: + raise AssertionError( + "Test failed due to timeout after %g sec, with last condition returning: %s" + % (timeout, lastValue)) def _squared_distance(a, b): @@ -999,10 +1035,13 @@ def test_accuracy_for_single_center(self): [self.sc.parallelize(batch, 1) for batch in batches]) stkm.trainOn(input_stream) - t = time() self.ssc.start() - self._ssc_wait(t, 10.0, 0.01) - self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) + + def condition(): + self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) + return True + self._eventually(condition, catch_assertions=True) + realCenters = array_sum(array(centers), axis=0) for i in range(5): modelCenters = stkm.latestModel().centers[0][i] @@ -1027,7 +1066,7 @@ def test_trainOn_model(self): stkm.setInitialCenters( centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0]) - # Create a toy dataset by setting a tiny offest for each point. + # Create a toy dataset by setting a tiny offset for each point. offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]] batches = [] for offset in offsets: @@ -1037,14 +1076,15 @@ def test_trainOn_model(self): batches = [self.sc.parallelize(batch, 1) for batch in batches] input_stream = self.ssc.queueStream(batches) stkm.trainOn(input_stream) - t = time() self.ssc.start() # Give enough time to train the model. - self._ssc_wait(t, 6.0, 0.01) - finalModel = stkm.latestModel() - self.assertTrue(all(finalModel.centers == array(initCenters))) - self.assertEquals(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) + def condition(): + finalModel = stkm.latestModel() + self.assertTrue(all(finalModel.centers == array(initCenters))) + self.assertEquals(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) + return True + self._eventually(condition, catch_assertions=True) def test_predictOn_model(self): """Test that the model predicts correctly on toy data.""" @@ -1066,10 +1106,13 @@ def update(rdd): result.append(rdd_collect) predict_val.foreachRDD(update) - t = time() self.ssc.start() - self._ssc_wait(t, 6.0, 0.01) - self.assertEquals(result, [[0], [1], [2], [3]]) + + def condition(): + self.assertEquals(result, [[0], [1], [2], [3]]) + return True + + self._eventually(condition, catch_assertions=True) def test_trainOn_predictOn(self): """Test that prediction happens on the updated model.""" @@ -1095,10 +1138,13 @@ def collect(rdd): predict_stream = stkm.predictOn(input_stream) predict_stream.foreachRDD(collect) - t = time() self.ssc.start() - self._ssc_wait(t, 6.0, 0.01) - self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]]) + + def condition(): + self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]]) + return True + + self._eventually(condition, catch_assertions=True) class LinearDataGeneratorTests(MLlibTestCase): @@ -1156,11 +1202,14 @@ def test_parameter_accuracy(self): slr.setInitialWeights([0.0]) slr.trainOn(input_stream) - t = time() self.ssc.start() - self._ssc_wait(t, 20.0, 0.01) - rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5 - self.assertAlmostEqual(rel, 0.1, 1) + + def condition(): + rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5 + self.assertAlmostEqual(rel, 0.1, 1) + return True + + self._eventually(condition, catch_assertions=True) def test_convergence(self): """ 
@@ -1179,13 +1228,18 @@ def test_convergence(self): input_stream.foreachRDD( lambda x: models.append(slr.latestModel().weights[0])) - t = time() self.ssc.start() - self._ssc_wait(t, 15.0, 0.01) + + def condition(): + self.assertEquals(len(models), len(input_batches)) + return True + + # We want all batches to finish for this test. + self._eventually(condition, 60.0, catch_assertions=True) + t_models = array(models) diff = t_models[1:] - t_models[:-1] - - # Test that weights improve with a small tolerance, + # Test that weights improve with a small tolerance self.assertTrue(all(diff >= -0.1)) self.assertTrue(array_sum(diff > 0) > 1) @@ -1208,9 +1262,13 @@ def test_predictions(self): predict_stream = slr.predictOnValues(input_stream) true_predicted = [] predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect())) - t = time() self.ssc.start() - self._ssc_wait(t, 5.0, 0.01) + + def condition(): + self.assertEquals(len(true_predicted), len(input_batches)) + return True + + self._eventually(condition, catch_assertions=True) # Test that the accuracy error is no more than 0.4 on each batch. for batch in true_predicted: @@ -1242,12 +1300,17 @@ def collect_errors(rdd): ps = slr.predictOnValues(predict_stream) ps.foreachRDD(lambda x: collect_errors(x)) - t = time() self.ssc.start() - self._ssc_wait(t, 20.0, 0.01) - # Test that the improvement in error is atleast 0.3 - self.assertTrue(errors[1] - errors[-1] > 0.3) + def condition(): + # Test that the improvement in error is > 0.3 + if len(errors) == len(predict_batches): + self.assertGreater(errors[1] - errors[-1], 0.3) + if len(errors) >= 3 and errors[1] - errors[-1] > 0.3: + return True + return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) + + self._eventually(condition) class StreamingLinearRegressionWithTests(MLLibStreamingTestCase): @@ -1274,13 +1337,16 @@ def test_parameter_accuracy(self): batches.append(sc.parallelize(batch)) input_stream = self.ssc.queueStream(batches) - t = time() slr.trainOn(input_stream) self.ssc.start() - self._ssc_wait(t, 10, 0.01) - self.assertArrayAlmostEqual( - slr.latestModel().weights.array, [10., 10.], 1) - self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1) + + def condition(): + self.assertArrayAlmostEqual( + slr.latestModel().weights.array, [10., 10.], 1) + self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1) + return True + + self._eventually(condition, catch_assertions=True) def test_parameter_convergence(self): """Test that the model parameters improve with streaming data.""" @@ -1298,13 +1364,18 @@ def test_parameter_convergence(self): input_stream = self.ssc.queueStream(batches) input_stream.foreachRDD( lambda x: model_weights.append(slr.latestModel().weights[0])) - t = time() slr.trainOn(input_stream) self.ssc.start() - self._ssc_wait(t, 10, 0.01) - model_weights = array(model_weights) - diff = model_weights[1:] - model_weights[:-1] + def condition(): + self.assertEquals(len(model_weights), len(batches)) + return True + + # We want all batches to finish for this test. 
+ self._eventually(condition, catch_assertions=True) + + w = array(model_weights) + diff = w[1:] - w[:-1] self.assertTrue(all(diff >= -0.1)) def test_prediction(self): @@ -1323,13 +1394,18 @@ def test_prediction(self): sc.parallelize(batch).map(lambda lp: (lp.label, lp.features))) input_stream = self.ssc.queueStream(batches) - t = time() output_stream = slr.predictOnValues(input_stream) samples = [] output_stream.foreachRDD(lambda x: samples.append(x.collect())) self.ssc.start() - self._ssc_wait(t, 5, 0.01) + + def condition(): + self.assertEquals(len(samples), len(batches)) + return True + + # We want all batches to finish for this test. + self._eventually(condition, catch_assertions=True) # Test that mean absolute error on each batch is less than 0.1 for batch in samples: @@ -1350,22 +1426,27 @@ def test_train_prediction(self): predict_batches = [ b.map(lambda lp: (lp.label, lp.features)) for b in batches] - mean_absolute_errors = [] + errors = [] def func(rdd): true, predicted = zip(*rdd.collect()) - mean_absolute_errors.append(mean(abs(true) - abs(predicted))) + errors.append(mean(abs(true) - abs(predicted))) - model_weights = [] input_stream = self.ssc.queueStream(batches) output_stream = self.ssc.queueStream(predict_batches) - t = time() slr.trainOn(input_stream) output_stream = slr.predictOnValues(output_stream) output_stream.foreachRDD(func) self.ssc.start() - self._ssc_wait(t, 10, 0.01) - self.assertTrue(mean_absolute_errors[1] - mean_absolute_errors[-1] > 2) + + def condition(): + if len(errors) == len(predict_batches): + self.assertGreater(errors[1] - errors[-1], 2) + if len(errors) >= 3 and errors[1] - errors[-1] > 2: + return True + return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) + + self._eventually(condition) class MLUtilsTests(MLlibTestCase): diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 916de2d6fcdb..10a1e4b3eb0f 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -300,6 +300,7 @@ def generateLinearInput(intercept, weights, xMean, xVariance, :param: seed Random Seed :param: eps Used to scale the noise. If eps is set high, the amount of gaussian noise added is more. + Returns a list of LabeledPoints of length nPoints """ weights = [float(weight) for weight in weights] diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index fa8e0a0574a6..9ef60a7e2c84 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -700,7 +700,7 @@ def groupBy(self, f, numPartitions=None): return self.map(lambda x: (f(x), x)).groupByKey(numPartitions) @ignore_unicode_prefix - def pipe(self, command, env={}, checkCode=False): + def pipe(self, command, env=None, checkCode=False): """ Return an RDD created by piping elements to a forked external process. @@ -709,6 +709,9 @@ def pipe(self, command, env={}, checkCode=False): :param checkCode: whether or not to check the return value of the shell command. 
""" + if env is None: + env = dict() + def func(iterator): pipe = Popen( shlex.split(command), env=env, stdin=PIPE, stdout=PIPE) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 411b4dbf481f..2a1326947f4f 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -359,6 +359,7 @@ def _hack_namedtuple(cls): def __reduce__(self): return (_restore, (name, fields, tuple(self))) cls.__reduce__ = __reduce__ + cls._is_namedtuple_ = True return cls diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 0a85da7443d3..8af8637cf948 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -16,6 +16,7 @@ # import sys +import warnings if sys.version >= '3': basestring = str @@ -254,12 +255,29 @@ def inSet(self, *cols): [Row(age=5, name=u'Bob')] >>> df[df.age.inSet([1, 2, 3])].collect() [Row(age=2, name=u'Alice')] + + .. note:: Deprecated in 1.5, use :func:`Column.isin` instead. + """ + warnings.warn("inSet is deprecated. Use isin() instead.") + return self.isin(*cols) + + @ignore_unicode_prefix + @since(1.5) + def isin(self, *cols): + """ + A boolean expression that is evaluated to true if the value of this + expression is contained by the evaluated values of the arguments. + + >>> df[df.name.isin("Bob", "Mike")].collect() + [Row(age=5, name=u'Bob')] + >>> df[df.age.isin([1, 2, 3])].collect() + [Row(age=2, name=u'Alice')] """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols] sc = SparkContext._active_spark_context - jc = getattr(self._jc, "in")(_to_seq(sc, cols)) + jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) return Column(jc) # order diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 917de24f3536..0ef46c44644a 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -39,7 +39,7 @@ try: import pandas has_pandas = True -except ImportError: +except Exception: has_pandas = False __all__ = ["SQLContext", "HiveContext", "UDFRegistration"] diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 0f3480c23918..025811f51929 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -212,8 +212,7 @@ def explain(self, extended=False): :param extended: boolean, default ``False``. If ``False``, prints only the physical plan. >>> df.explain() - PhysicalRDD [age#0,name#1], MapPartitionsRDD[...] at applySchemaToPythonRDD at\ - NativeMethodAccessorImpl.java:... + Scan PhysicalRDD[age#0,name#1] >>> df.explain(True) == Parsed Logical Plan == @@ -224,7 +223,6 @@ def explain(self, extended=False): ... == Physical Plan == ... 
- == RDD == """ if extended: print(self._jdf.queryExecution().toString()) @@ -568,8 +566,7 @@ def join(self, other, on=None, how=None): if on is None or len(on) == 0: jdf = self._jdf.join(other._jdf) - - if isinstance(on[0], basestring): + elif isinstance(on[0], basestring): jdf = self._jdf.join(other._jdf, self._jseq(on)) else: assert isinstance(on[0], Column), "on should be Column or list of Column" @@ -725,8 +722,6 @@ def __getitem__(self, item): [Row(age=5, name=u'Bob')] """ if isinstance(item, basestring): - if item not in self.columns: - raise IndexError("no such column: %s" % item) jc = self._jdf.apply(item) return Column(jc) elif isinstance(item, Column): @@ -1207,7 +1202,9 @@ def freqItems(self, cols, support=None): @ignore_unicode_prefix @since(1.3) def withColumn(self, colName, col): - """Returns a new :class:`DataFrame` by adding a column. + """ + Returns a new :class:`DataFrame` by adding a column or replacing the + existing column that has the same name. :param colName: string, name of the new column. :param col: a :class:`Column` expression for the new column. @@ -1215,7 +1212,8 @@ def withColumn(self, colName, col): >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] """ - return self.select('*', col.alias(colName)) + assert isinstance(col, Column), "col should be Column" + return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) @ignore_unicode_prefix @since(1.3) @@ -1228,10 +1226,7 @@ def withColumnRenamed(self, existing, new): >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] """ - cols = [Column(_to_java_column(c)).alias(new) - if c == existing else c - for c in self.columns] - return self.select(*cols) + return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) @since(1.4) @ignore_unicode_prefix diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a73ecc7d9336..4b74a501521a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -32,41 +32,6 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq -__all__ = [ - 'array', - 'approxCountDistinct', - 'bin', - 'coalesce', - 'countDistinct', - 'explode', - 'format_number', - 'length', - 'log2', - 'md5', - 'monotonicallyIncreasingId', - 'rand', - 'randn', - 'regexp_extract', - 'regexp_replace', - 'sha1', - 'sha2', - 'size', - 'sort_array', - 'sparkPartitionId', - 'struct', - 'udf', - 'when'] - -__all__ += ['lag', 'lead', 'ntile'] - -__all__ += [ - 'date_format', 'date_add', 'date_sub', 'add_months', 'months_between', - 'year', 'quarter', 'month', 'hour', 'minute', 'second', - 'dayofmonth', 'dayofyear', 'weekofyear'] - -__all__ += ['soundex', 'substring', 'substring_index'] - - def _create_function(name, doc=""): """ Create a function for aggregator by name""" def _(col): @@ -208,30 +173,6 @@ def _(): for _name, _doc in _window_functions.items(): globals()[_name] = since(1.4)(_create_window_function(_name, _doc)) del _name, _doc -__all__ += _functions.keys() -__all__ += _functions_1_4.keys() -__all__ += _binary_mathfunctions.keys() -__all__ += _window_functions.keys() -__all__.sort() - - -@since(1.4) -def array(*cols): - """Creates a new array column. - - :param cols: list of column names (string) or list of :class:`Column` expressions that have - the same data type. 
- - >>> df.select(array('age', 'age').alias("arr")).collect() - [Row(arr=[2, 2]), Row(arr=[5, 5])] - >>> df.select(array([df.age, df.age]).alias("arr")).collect() - [Row(arr=[2, 2]), Row(arr=[5, 5])] - """ - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column)) - return Column(jc) @since(1.3) @@ -249,19 +190,6 @@ def approxCountDistinct(col, rsd=None): return Column(jc) -@ignore_unicode_prefix -@since(1.5) -def bin(col): - """Returns the string representation of the binary value of the given column. - - >>> df.select(bin(df.age).alias('c')).collect() - [Row(c=u'10'), Row(c=u'101')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.bin(_to_java_column(col)) - return Column(jc) - - @since(1.4) def coalesce(*cols): """Returns the first column that is not null. @@ -314,82 +242,6 @@ def countDistinct(col, *cols): return Column(jc) -@since(1.4) -def explode(col): - """Returns a new row for each element in the given array or map. - - >>> from pyspark.sql import Row - >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) - >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect() - [Row(anInt=1), Row(anInt=2), Row(anInt=3)] - - >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show() - +---+-----+ - |key|value| - +---+-----+ - | a| b| - +---+-----+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.explode(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def levenshtein(left, right): - """Computes the Levenshtein distance of the two given strings. - - >>> df0 = sqlContext.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) - >>> df0.select(levenshtein('l', 'r').alias('d')).collect() - [Row(d=3)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def regexp_extract(str, pattern, idx): - """Extract a specific(idx) group identified by a java regex, from the specified string column. - - >>> df = sqlContext.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect() - [Row(d=u'100')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def regexp_replace(str, pattern, replacement): - """Replace all substrings of the specified string value that match regexp with rep. - - >>> df = sqlContext.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_replace('str', '(\\d+)', '##').alias('d')).collect() - [Row(d=u'##-##')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def md5(col): - """Calculates the MD5 digest and returns the value as a 32 character hex string. - - >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() - [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.md5(_to_java_column(col)) - return Column(jc) - - @since(1.4) def monotonicallyIncreasingId(): """A column that generates monotonically increasing 64-bit integers. 
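For illustration only, a minimal usage sketch of the Column.isin API added in python/pyspark/sql/column.py earlier in this diff (the old inSet spelling is kept but now warns and delegates to isin); it assumes the same example DataFrame df, with columns age and name, that the surrounding doctests use:

>>> df[df.name.isin("Bob", "Mike")].collect()
[Row(age=5, name=u'Bob')]
>>> df[df.age.isin([1, 2, 3])].collect()
[Row(age=2, name=u'Alice')]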
@@ -416,7 +268,7 @@ def rand(seed=None): """Generates a random column with i.i.d. samples from U[0.0, 1.0]. """ sc = SparkContext._active_spark_context - if seed: + if seed is not None: jc = sc._jvm.functions.rand(seed) else: jc = sc._jvm.functions.rand() @@ -428,70 +280,24 @@ def randn(seed=None): """Generates a column with i.i.d. samples from the standard normal distribution. """ sc = SparkContext._active_spark_context - if seed: + if seed is not None: jc = sc._jvm.functions.randn(seed) else: jc = sc._jvm.functions.randn() return Column(jc) -@ignore_unicode_prefix -@since(1.5) -def hex(col): - """Computes hex value of the given column, which could be StringType, - BinaryType, IntegerType or LongType. - - >>> sqlContext.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() - [Row(hex(a)=u'414243', hex(b)=u'3')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.hex(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def unhex(col): - """Inverse of hex. Interprets each pair of characters as a hexadecimal number - and converts to the byte representation of number. - - >>> sqlContext.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() - [Row(unhex(a)=bytearray(b'ABC'))] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.unhex(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix @since(1.5) -def sha1(col): - """Returns the hex string result of SHA-1. - - >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() - [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] +def round(col, scale=0): """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.sha1(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def sha2(col, numBits): - """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, - and SHA-512). The numBits indicates the desired bit length of the result, which must have a - value of 224, 256, 384, 512, or 0 (which is equivalent to 256). + Round the value of `e` to `scale` decimal places if `scale` >= 0 + or at integral part when `scale` < 0. - >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() - >>> digests[0] - Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') - >>> digests[1] - Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') + >>> sqlContext.createDataFrame([(2.546,)], ['a']).select(round('a', 1).alias('r')).collect() + [Row(r=2.5)] """ sc = SparkContext._active_spark_context - jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) - return Column(jc) + return Column(sc._jvm.functions.round(_to_java_column(col), scale)) @since(1.5) @@ -502,8 +308,7 @@ def shiftLeft(col, numBits): [Row(r=42)] """ sc = SparkContext._active_spark_context - jc = sc._jvm.functions.shiftLeft(_to_java_column(col), numBits) - return Column(jc) + return Column(sc._jvm.functions.shiftLeft(_to_java_column(col), numBits)) @since(1.5) @@ -522,8 +327,8 @@ def shiftRight(col, numBits): def shiftRightUnsigned(col, numBits): """Unsigned shift the the given value numBits right. 
- >>> sqlContext.createDataFrame([(-42,)], ['a']).select(shiftRightUnsigned('a', 1).alias('r'))\ - .collect() + >>> df = sqlContext.createDataFrame([(-42,)], ['a']) + >>> df.select(shiftRightUnsigned('a', 1).alias('r')).collect() [Row(r=9223372036854775787)] """ sc = SparkContext._active_spark_context @@ -544,6 +349,7 @@ def sparkPartitionId(): return Column(sc._jvm.functions.sparkPartitionId()) +@since(1.5) def expr(str): """Parses the expression string into the column that it represents @@ -554,34 +360,6 @@ def expr(str): return Column(sc._jvm.functions.expr(str)) -@ignore_unicode_prefix -@since(1.5) -def length(col): - """Calculates the length of a string or binary expression. - - >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect() - [Row(length=3)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.length(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(1.5) -def format_number(col, d): - """Formats the number X to a format like '#,###,###.##', rounded to d decimal places, - and returns the result as a string. - - :param col: the column name of the numeric value to be formatted - :param d: the N decimal places - - >>> sqlContext.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() - [Row(v=u'5.0000')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.format_number(_to_java_column(col), d)) - - @ignore_unicode_prefix @since(1.4) def struct(*cols): @@ -601,6 +379,38 @@ def struct(*cols): return Column(jc) +@since(1.5) +def greatest(*cols): + """ + Returns the greatest value of the list of column names, skipping null values. + This function takes at least 2 parameters. It will return null iff all parameters are null. + + >>> df = sqlContext.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) + >>> df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect() + [Row(greatest=4)] + """ + if len(cols) < 2: + raise ValueError("greatest should take at least two columns") + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.greatest(_to_seq(sc, cols, _to_java_column))) + + +@since(1.5) +def least(*cols): + """ + Returns the least value of the list of column names, skipping null values. + This function takes at least 2 parameters. It will return null iff all parameters are null. + + >>> df = sqlContext.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) + >>> df.select(least(df.a, df.b, df.c).alias("least")).collect() + [Row(least=1)] + """ + if len(cols) < 2: + raise ValueError("least should take at least two columns") + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column))) + + @since(1.4) def when(condition, value): """Evaluates a list of conditions and returns one of multiple possible result expressions. @@ -654,6 +464,35 @@ def log2(col): return Column(sc._jvm.functions.log2(_to_java_column(col))) +@since(1.5) +@ignore_unicode_prefix +def conv(col, fromBase, toBase): + """ + Convert a number in a string column from one base to another. + + >>> df = sqlContext.createDataFrame([("010101",)], ['n']) + >>> df.select(conv(df.n, 2, 16).alias('hex')).collect() + [Row(hex=u'15')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase)) + + +@since(1.5) +def factorial(col): + """ + Computes the factorial of the given value. 
+ + >>> df = sqlContext.createDataFrame([(5,)], ['n']) + >>> df.select(factorial(df.n).alias('f')).collect() + [Row(f=120)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.factorial(_to_java_column(col))) + + +# --------------- Window functions ------------------------ + @since(1.4) def lag(col, count=1, default=None): """ @@ -691,9 +530,10 @@ def lead(col, count=1, default=None): @since(1.4) def ntile(n): """ - Window function: returns a group id from 1 to `n` (inclusive) in a round-robin fashion in - a window partition. Fow example, if `n` is 3, the first row will get 1, the second row will - get 2, the third row will get 3, and the fourth row will get 1... + Window function: returns the ntile group id (from 1 to `n` inclusive) + in an ordered window partition. For example, if `n` is 4, the first + quarter of the rows will get value 1, the second quarter will get 2, + the third quarter will get 3, and the last quarter will get 4. This is equivalent to the NTILE function in SQL. @@ -703,9 +543,28 @@ def ntile(n): return Column(sc._jvm.functions.ntile(int(n))) +# ---------------------- Date/Timestamp functions ------------------------------ + +@since(1.5) +def current_date(): + """ + Returns the current date as a date column. + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.current_date()) + + +def current_timestamp(): + """ + Returns the current timestamp as a timestamp column. + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.current_timestamp()) + + @ignore_unicode_prefix @since(1.5) -def date_format(dateCol, format): +def date_format(date, format): """ Converts a date/timestamp/string to a value of string in the format specified by the date format given by the second argument. @@ -721,7 +580,7 @@ def date_format(dateCol, format): [Row(date=u'04/08/2015')] """ sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.date_format(_to_java_column(dateCol), format)) + return Column(sc._jvm.functions.date_format(_to_java_column(date), format)) @since(1.5) @@ -867,6 +726,19 @@ def date_sub(start, days): return Column(sc._jvm.functions.date_sub(_to_java_column(start), days)) +@since(1.5) +def datediff(end, start): + """ + Returns the number of days from `start` to `end`. + + >>> df = sqlContext.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2']) + >>> df.select(datediff(df.d2, df.d1).alias('diff')).collect() + [Row(diff=32)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.datediff(_to_java_column(end), _to_java_column(start))) + + @since(1.5) def add_months(start, months): """ @@ -924,33 +796,296 @@ def trunc(date, format): @since(1.5) -@ignore_unicode_prefix -def substring(str, pos, len): +def next_day(date, dayOfWeek): """ - Substring starts at `pos` and is of length `len` when str is String type or - returns the slice of byte array that starts at `pos` in byte and is of length `len` - when str is Binary type + Returns the first date which is later than the value of the date column. - >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) - >>> df.select(substring(df.s, 1, 2).alias('s')).collect() - [Row(s=u'ab')] + Day of the week parameter is case insensitive, and accepts: + "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". 
+ + >>> df = sqlContext.createDataFrame([('2015-07-27',)], ['d']) + >>> df.select(next_day(df.d, 'Sun').alias('date')).collect() + [Row(date=datetime.date(2015, 8, 2))] """ sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len)) + return Column(sc._jvm.functions.next_day(_to_java_column(date), dayOfWeek)) @since(1.5) -@ignore_unicode_prefix -def substring_index(str, delim, count): +def last_day(date): """ - Returns the substring from string str before count occurrences of the delimiter delim. - If count is positive, everything the left of the final delimiter (counting from left) is - returned. If count is negative, every to the right of the final delimiter (counting from the - right) is returned. substring_index performs a case-sensitive match when searching for delim. + Returns the last day of the month which the given date belongs to. - >>> df = sqlContext.createDataFrame([('a.b.c.d',)], ['s']) - >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() - [Row(s=u'a.b')] + >>> df = sqlContext.createDataFrame([('1997-02-10',)], ['d']) + >>> df.select(last_day(df.d).alias('date')).collect() + [Row(date=datetime.date(1997, 2, 28))] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.last_day(_to_java_column(date))) + + +@since(1.5) +def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): + """ + Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string + representing the timestamp of that moment in the current system time zone in the given + format. + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.from_unixtime(_to_java_column(timestamp), format)) + + +@since(1.5) +def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'): + """ + Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default) + to Unix time stamp (in seconds), using the default timezone and the default + locale, return null if fail. + + if `timestamp` is None, then it returns current timestamp. + """ + sc = SparkContext._active_spark_context + if timestamp is None: + return Column(sc._jvm.functions.unix_timestamp()) + return Column(sc._jvm.functions.unix_timestamp(_to_java_column(timestamp), format)) + + +@since(1.5) +def from_utc_timestamp(timestamp, tz): + """ + Assumes given timestamp is UTC and converts to given timezone. + + >>> df = sqlContext.createDataFrame([('1997-02-28 10:30:00',)], ['t']) + >>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect() + [Row(t=datetime.datetime(1997, 2, 28, 2, 30))] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz)) + + +@since(1.5) +def to_utc_timestamp(timestamp, tz): + """ + Assumes given timestamp is in given timezone and converts to UTC. + + >>> df = sqlContext.createDataFrame([('1997-02-28 10:30:00',)], ['t']) + >>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect() + [Row(t=datetime.datetime(1997, 2, 28, 18, 30))] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz)) + + +# ---------------------------- misc functions ---------------------------------- + +@since(1.5) +@ignore_unicode_prefix +def crc32(col): + """ + Calculates the cyclic redundancy check value (CRC32) of a binary column and + returns the value as a bigint. 
+ + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect() + [Row(crc32=2743272264)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.crc32(_to_java_column(col))) + + +@ignore_unicode_prefix +@since(1.5) +def md5(col): + """Calculates the MD5 digest and returns the value as a 32 character hex string. + + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() + [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.md5(_to_java_column(col)) + return Column(jc) + + +@ignore_unicode_prefix +@since(1.5) +def sha1(col): + """Returns the hex string result of SHA-1. + + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() + [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.sha1(_to_java_column(col)) + return Column(jc) + + +@ignore_unicode_prefix +@since(1.5) +def sha2(col, numBits): + """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, + and SHA-512). The numBits indicates the desired bit length of the result, which must have a + value of 224, 256, 384, 512, or 0 (which is equivalent to 256). + + >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() + >>> digests[0] + Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') + >>> digests[1] + Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) + return Column(jc) + + +# ---------------------- String/Binary functions ------------------------------ + +_string_functions = { + 'ascii': 'Computes the numeric value of the first character of the string column.', + 'base64': 'Computes the BASE64 encoding of a binary column and returns it as a string column.', + 'unbase64': 'Decodes a BASE64 encoded string column and returns it as a binary column.', + 'initcap': 'Returns a new string column by converting the first letter of each word to ' + + 'uppercase. Words are delimited by whitespace.', + 'lower': 'Converts a string column to lower case.', + 'upper': 'Converts a string column to upper case.', + 'reverse': 'Reverses the string column and returns it as a new string column.', + 'ltrim': 'Trim the spaces from right end for the specified string value.', + 'rtrim': 'Trim the spaces from right end for the specified string value.', + 'trim': 'Trim the spaces from both ends for the specified string column.', +} + + +for _name, _doc in _string_functions.items(): + globals()[_name] = since(1.5)(_create_function(_name, _doc)) +del _name, _doc + + +@since(1.5) +@ignore_unicode_prefix +def concat(*cols): + """ + Concatenates multiple input string columns together into a single string column. + + >>> df = sqlContext.createDataFrame([('abcd','123')], ['s', 'd']) + >>> df.select(concat(df.s, df.d).alias('s')).collect() + [Row(s=u'abcd123')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column))) + + +@since(1.5) +@ignore_unicode_prefix +def concat_ws(sep, *cols): + """ + Concatenates multiple input string columns together into a single string column, + using the given separator. 
+ + >>> df = sqlContext.createDataFrame([('abcd','123')], ['s', 'd']) + >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() + [Row(s=u'abcd-123')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column))) + + +@since(1.5) +def decode(col, charset): + """ + Computes the first argument into a string from a binary using the provided character set + (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.decode(_to_java_column(col), charset)) + + +@since(1.5) +def encode(col, charset): + """ + Computes the first argument into a binary from a string using the provided character set + (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.encode(_to_java_column(col), charset)) + + +@ignore_unicode_prefix +@since(1.5) +def format_number(col, d): + """ + Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places, + and returns the result as a string. + + :param col: the column name of the numeric value to be formatted + :param d: the N decimal places + + >>> sqlContext.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() + [Row(v=u'5.0000')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.format_number(_to_java_column(col), d)) + + +@ignore_unicode_prefix +@since(1.5) +def format_string(format, *cols): + """ + Formats the arguments in printf-style and returns the result as a string column. + + :param col: the column name of the numeric value to be formatted + :param d: the N decimal places + + >>> df = sqlContext.createDataFrame([(5, "hello")], ['a', 'b']) + >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() + [Row(v=u'5 hello')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column))) + + +@since(1.5) +def instr(str, substr): + """ + Locate the position of the first occurrence of substr column in the given string. + Returns null if either of the arguments are null. + + NOTE: The position is not zero based, but 1 based index, returns 0 if substr + could not be found in str. + + >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) + >>> df.select(instr(df.s, 'b').alias('s')).collect() + [Row(s=2)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.instr(_to_java_column(str), substr)) + + +@since(1.5) +@ignore_unicode_prefix +def substring(str, pos, len): + """ + Substring starts at `pos` and is of length `len` when str is String type or + returns the slice of byte array that starts at `pos` in byte and is of length `len` + when str is Binary type + + >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) + >>> df.select(substring(df.s, 1, 2).alias('s')).collect() + [Row(s=u'ab')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len)) + + +@since(1.5) +@ignore_unicode_prefix +def substring_index(str, delim, count): + """ + Returns the substring from string str before count occurrences of the delimiter delim. + If count is positive, everything the left of the final delimiter (counting from left) is + returned. If count is negative, every to the right of the final delimiter (counting from the + right) is returned. 
substring_index performs a case-sensitive match when searching for delim. + + >>> df = sqlContext.createDataFrame([('a.b.c.d',)], ['s']) + >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() + [Row(s=u'a.b')] >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() [Row(s=u'b.c.d')] """ @@ -958,6 +1093,126 @@ def substring_index(str, delim, count): return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count)) +@ignore_unicode_prefix +@since(1.5) +def levenshtein(left, right): + """Computes the Levenshtein distance of the two given strings. + + >>> df0 = sqlContext.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) + >>> df0.select(levenshtein('l', 'r').alias('d')).collect() + [Row(d=3)] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right)) + return Column(jc) + + +@since(1.5) +def locate(substr, str, pos=0): + """ + Locate the position of the first occurrence of substr in a string column, after position pos. + + NOTE: The position is not zero based, but 1 based index. returns 0 if substr + could not be found in str. + + :param substr: a string + :param str: a Column of StringType + :param pos: start position (zero based) + + >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) + >>> df.select(locate('b', df.s, 1).alias('s')).collect() + [Row(s=2)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos)) + + +@since(1.5) +@ignore_unicode_prefix +def lpad(col, len, pad): + """ + Left-pad the string column to width `len` with `pad`. + + >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) + >>> df.select(lpad(df.s, 6, '#').alias('s')).collect() + [Row(s=u'##abcd')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad)) + + +@since(1.5) +@ignore_unicode_prefix +def rpad(col, len, pad): + """ + Right-pad the string column to width `len` with `pad`. + + >>> df = sqlContext.createDataFrame([('abcd',)], ['s',]) + >>> df.select(rpad(df.s, 6, '#').alias('s')).collect() + [Row(s=u'abcd##')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad)) + + +@since(1.5) +@ignore_unicode_prefix +def repeat(col, n): + """ + Repeats a string column n times, and returns it as a new string column. + + >>> df = sqlContext.createDataFrame([('ab',)], ['s',]) + >>> df.select(repeat(df.s, 3).alias('s')).collect() + [Row(s=u'ababab')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.repeat(_to_java_column(col), n)) + + +@since(1.5) +@ignore_unicode_prefix +def split(str, pattern): + """ + Splits str around pattern (pattern is a regular expression). + + NOTE: pattern is a string represent the regular expression. + + >>> df = sqlContext.createDataFrame([('ab12cd',)], ['s',]) + >>> df.select(split(df.s, '[0-9]+').alias('s')).collect() + [Row(s=[u'ab', u'cd'])] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.split(_to_java_column(str), pattern)) + + +@ignore_unicode_prefix +@since(1.5) +def regexp_extract(str, pattern, idx): + """Extract a specific(idx) group identified by a java regex, from the specified string column. 
+ + >>> df = sqlContext.createDataFrame([('100-200',)], ['str']) + >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect() + [Row(d=u'100')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx) + return Column(jc) + + +@ignore_unicode_prefix +@since(1.5) +def regexp_replace(str, pattern, replacement): + """Replace all substrings of the specified string value that match regexp with rep. + + >>> df = sqlContext.createDataFrame([('100-200',)], ['str']) + >>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect() + [Row(d=u'-----')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement) + return Column(jc) + + @ignore_unicode_prefix @since(1.5) def initcap(col): @@ -970,6 +1225,147 @@ def initcap(col): return Column(sc._jvm.functions.initcap(_to_java_column(col))) +@since(1.5) +@ignore_unicode_prefix +def soundex(col): + """ + Returns the SoundEx encoding for a string + + >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name']) + >>> df.select(soundex(df.name).alias("soundex")).collect() + [Row(soundex=u'P362'), Row(soundex=u'U612')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.soundex(_to_java_column(col))) + + +@ignore_unicode_prefix +@since(1.5) +def bin(col): + """Returns the string representation of the binary value of the given column. + + >>> df.select(bin(df.age).alias('c')).collect() + [Row(c=u'10'), Row(c=u'101')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.bin(_to_java_column(col)) + return Column(jc) + + +@ignore_unicode_prefix +@since(1.5) +def hex(col): + """Computes hex value of the given column, which could be StringType, + BinaryType, IntegerType or LongType. + + >>> sqlContext.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() + [Row(hex(a)=u'414243', hex(b)=u'3')] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.hex(_to_java_column(col)) + return Column(jc) + + +@ignore_unicode_prefix +@since(1.5) +def unhex(col): + """Inverse of hex. Interprets each pair of characters as a hexadecimal number + and converts to the byte representation of number. + + >>> sqlContext.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() + [Row(unhex(a)=bytearray(b'ABC'))] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.unhex(_to_java_column(col))) + + +@ignore_unicode_prefix +@since(1.5) +def length(col): + """Calculates the length of a string or binary expression. + + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect() + [Row(length=3)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.length(_to_java_column(col))) + + +@ignore_unicode_prefix +@since(1.5) +def translate(srcCol, matching, replace): + """A function translate any character in the `srcCol` by a character in `matching`. + The characters in `replace` is corresponding to the characters in `matching`. + The translate will happen when any character in the string matching with the character + in the `matching`. 
+ + >>> sqlContext.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123")\ + .alias('r')).collect() + [Row(r=u'1a2s3ae')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace)) + + +# ---------------------- Collection functions ------------------------------ + +@since(1.4) +def array(*cols): + """Creates a new array column. + + :param cols: list of column names (string) or list of :class:`Column` expressions that have + the same data type. + + >>> df.select(array('age', 'age').alias("arr")).collect() + [Row(arr=[2, 2]), Row(arr=[5, 5])] + >>> df.select(array([df.age, df.age]).alias("arr")).collect() + [Row(arr=[2, 2]), Row(arr=[5, 5])] + """ + sc = SparkContext._active_spark_context + if len(cols) == 1 and isinstance(cols[0], (list, set)): + cols = cols[0] + jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column)) + return Column(jc) + + +@since(1.5) +def array_contains(col, value): + """ + Collection function: returns True if the array contains the given value. The collection + elements and value must be of the same type. + + :param col: name of column containing array + :param value: value to check for in array + + >>> df = sqlContext.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) + >>> df.select(array_contains(df.data, "a")).collect() + [Row(array_contains(data,a)=True), Row(array_contains(data,a)=False)] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.array_contains(_to_java_column(col), value)) + + +@since(1.4) +def explode(col): + """Returns a new row for each element in the given array or map. + + >>> from pyspark.sql import Row + >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) + >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect() + [Row(anInt=1), Row(anInt=2), Row(anInt=3)] + + >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show() + +---+-----+ + |key|value| + +---+-----+ + | a| b| + +---+-----+ + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.explode(_to_java_column(col)) + return Column(jc) + + @since(1.5) def size(col): """ @@ -1002,19 +1398,7 @@ def sort_array(col, asc=True): return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc)) -@since -@ignore_unicode_prefix -def soundex(col): - """ - Returns the SoundEx encoding for a string - - >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name']) - >>> df.select(soundex(df.name).alias("soundex")).collect() - [Row(soundex=u'P362'), Row(soundex=u'U612')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.size(_to_java_column(col))) - +# ---------------------------- User Defined Function ---------------------------------- class UserDefinedFunction(object): """ @@ -1066,6 +1450,11 @@ def udf(f, returnType=StringType()): """ return UserDefinedFunction(f, returnType) +blacklist = ['map', 'since', 'ignore_unicode_prefix'] +__all__ = [k for k, v in globals().items() + if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist] +__all__.sort() + def _test(): import doctest diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index dea8bad79e18..78247c8fa737 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -24,6 +24,16 @@ __all__ = ["DataFrameReader", "DataFrameWriter"] +def to_str(value): + """ + A wrapper over str(), but convert bool values to lower case 
string + """ + if isinstance(value, bool): + return str(value).lower() + else: + return str(value) + + class DataFrameReader(object): """ Interface used to load a :class:`DataFrame` from external storage systems @@ -77,7 +87,7 @@ def schema(self, schema): def option(self, key, value): """Adds an input option for the underlying data source. """ - self._jreader = self._jreader.option(key, value) + self._jreader = self._jreader.option(key, to_str(value)) return self @since(1.4) @@ -85,7 +95,7 @@ def options(self, **options): """Adds input options for the underlying data source. """ for k in options: - self._jreader = self._jreader.option(k, options[k]) + self._jreader = self._jreader.option(k, to_str(options[k])) return self @since(1.4) @@ -97,7 +107,8 @@ def load(self, path=None, format=None, schema=None, **options): :param schema: optional :class:`StructType` for the input schema. :param options: all other string options - >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned') + >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned', opt1=True, + ... opt2=1, opt3='str') >>> df.dtypes [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] """ @@ -171,7 +182,7 @@ def orc(self, path): @since(1.4) def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None, - predicates=None, properties={}): + predicates=None, properties=None): """ Construct a :class:`DataFrame` representing the database table accessible via JDBC URL `url` named `table` and connection `properties`. @@ -197,6 +208,8 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar should be included. :return: a DataFrame """ + if properties is None: + properties = dict() jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)() for k in properties: jprop.setProperty(k, properties[k]) @@ -416,7 +429,7 @@ def orc(self, path, mode=None, partitionBy=None): self._jwrite.orc(path) @since(1.4) - def jdbc(self, url, table, mode=None, properties={}): + def jdbc(self, url, table, mode=None, properties=None): """Saves the content of the :class:`DataFrame` to a external database table via JDBC. .. note:: Don't create too many partitions in parallel on a large cluster;\ @@ -434,6 +447,8 @@ def jdbc(self, url, table, mode=None, properties={}): arbitrary string tag/value. Normally at least a "user" and "password" property should be included. 
""" + if properties is None: + properties = dict() jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)() for k in properties: jprop.setProperty(k, properties[k]) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index ebd3ea8db6a4..6b647f3aacfa 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -145,6 +145,12 @@ class PythonOnlyPoint(ExamplePoint): __UDT__ = PythonOnlyUDT() +class MyObject(object): + def __init__(self, key, value): + self.key = key + self.value = value + + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 def test_data_type_eq(self): @@ -161,6 +167,11 @@ def test_decimal_type(self): t3 = DecimalType(8) self.assertNotEqual(t2, t3) + # regression test for SPARK-10392 + def test_datetype_equal_zero(self): + dt = DateType() + self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) + class SQLTests(ReusedPySparkTestCase): @@ -179,6 +190,21 @@ def tearDownClass(cls): ReusedPySparkTestCase.tearDownClass() shutil.rmtree(cls.tempdir.name, ignore_errors=True) + def test_row_should_be_read_only(self): + row = Row(a=1, b=2) + self.assertEqual(1, row.a) + + def foo(): + row.a = 3 + self.assertRaises(Exception, foo) + + row2 = self.sqlCtx.range(10).first() + self.assertEqual(0, row2.id) + + def foo2(): + row2.id = 2 + self.assertRaises(Exception, foo2) + def test_range(self): self.assertEqual(self.sqlCtx.range(1, 1).count(), 0) self.assertEqual(self.sqlCtx.range(1, 0, -1).count(), 1) @@ -368,6 +394,12 @@ def test_infer_nested_schema(self): df = self.sqlCtx.inferSchema(rdd) self.assertEquals(Row(field1=1, field2=u'row1'), df.first()) + def test_create_dataframe_from_objects(self): + data = [MyObject(1, "1"), MyObject(2, "2")] + df = self.sqlCtx.createDataFrame(data) + self.assertEqual(df.dtypes, [("key", "bigint"), ("value", "string")]) + self.assertEqual(df.first(), Row(key=1, value="1")) + def test_select_null_literal(self): df = self.sqlCtx.sql("select null as col") self.assertEquals(Row(col=None), df.first()) @@ -629,6 +661,16 @@ def test_rand_functions(self): for row in rndn: assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1] + # If the specified seed is 0, we should use it. 
+ # https://issues.apache.org/jira/browse/SPARK-9691 + rnd1 = df.select('key', functions.rand(0)).collect() + rnd2 = df.select('key', functions.rand(0)).collect() + self.assertEqual(sorted(rnd1), sorted(rnd2)) + + rndn1 = df.select('key', functions.randn(0)).collect() + rndn2 = df.select('key', functions.randn(0)).collect() + self.assertEqual(sorted(rndn1), sorted(rndn2)) + def test_between_function(self): df = self.sc.parallelize([ Row(a=1, b=2, c=3), @@ -745,7 +787,7 @@ def test_access_column(self): self.assertTrue(isinstance(df['key'], Column)) self.assertTrue(isinstance(df[0], Column)) self.assertRaises(IndexError, lambda: df[2]) - self.assertRaises(IndexError, lambda: df["bad_key"]) + self.assertRaises(AnalysisException, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) def test_column_name_with_non_ascii(self): @@ -769,7 +811,9 @@ def test_field_accessor(self): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) self.assertEqual(1, df.select(df.r["a"]).first()[0]) + self.assertEqual(1, df.select(df["r.a"]).first()[0]) self.assertEqual("b", df.select(df.r["b"]).first()[0]) + self.assertEqual("b", df.select(df["r.b"]).first()[0]) self.assertEqual("v", df.select(df.d["k"]).first()[0]) def test_infer_long_type(self): @@ -1008,6 +1052,10 @@ def test_capture_illegalargument_exception(self): self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect()) + def test_with_column_with_existing_name(self): + keys = self.df.withColumn("key", self.df.key).select("key").collect() + self.assertEqual([r.key for r in keys], list(range(100))) + class HiveContextSQLTests(ReusedPySparkTestCase): @@ -1099,5 +1147,28 @@ def test_window_functions(self): for r, ex in zip(rs, expected): self.assertEqual(tuple(r), ex[:len(r)]) + def test_window_functions_without_partitionBy(self): + df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) + w = Window.orderBy("key", df.value) + from pyspark.sql import functions as F + sel = df.select(df.value, df.key, + F.max("key").over(w.rowsBetween(0, 1)), + F.min("key").over(w.rowsBetween(0, 1)), + F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))), + F.rowNumber().over(w), + F.rank().over(w), + F.denseRank().over(w), + F.ntile(2).over(w)) + rs = sorted(sel.collect()) + expected = [ + ("1", 1, 1, 1, 4, 1, 1, 1, 1), + ("2", 1, 1, 1, 4, 2, 2, 2, 1), + ("2", 1, 2, 1, 4, 3, 2, 2, 2), + ("2", 2, 2, 2, 4, 4, 4, 3, 2) + ] + for r, ex in zip(rs, expected): + self.assertEqual(tuple(r), ex[:len(r)]) + + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 6f74b7162f7c..b0ac207418e7 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -168,10 +168,12 @@ def needConversion(self): return True def toInternal(self, d): - return d and d.toordinal() - self.EPOCH_ORDINAL + if d is not None: + return d.toordinal() - self.EPOCH_ORDINAL def fromInternal(self, v): - return v and datetime.date.fromordinal(v + self.EPOCH_ORDINAL) + if v is not None: + return datetime.date.fromordinal(v + self.EPOCH_ORDINAL) class TimestampType(AtomicType): @@ -467,9 +469,11 @@ def add(self, field, data_type=None, nullable=True, metadata=None): """ Construct a StructType by adding new elements to it to define the schema. The method accepts either: + a) A single parameter which is a StructField object. 
b) Between 2 and 4 parameters as (name, data_type, nullable (optional), - metadata(optional). The data_type parameter may be either a String or a DataType object + metadata(optional). The data_type parameter may be either a String or a + DataType object. >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) >>> struct2 = StructType([StructField("f1", StringType(), True),\ @@ -535,6 +539,9 @@ def toInternal(self, obj): return tuple(f.toInternal(obj.get(n)) for n, f in zip(self.names, self.fields)) elif isinstance(obj, (tuple, list)): return tuple(f.toInternal(v) for f, v in zip(self.fields, obj)) + elif hasattr(obj, "__dict__"): + d = obj.__dict__ + return tuple(f.toInternal(d.get(n)) for n, f in zip(self.names, self.fields)) else: raise ValueError("Unexpected tuple %r with StructType" % obj) else: @@ -542,6 +549,9 @@ def toInternal(self, obj): return tuple(obj.get(n) for n in self.names) elif isinstance(obj, (list, tuple)): return tuple(obj) + elif hasattr(obj, "__dict__"): + d = obj.__dict__ + return tuple(d.get(n) for n in self.names) else: raise ValueError("Unexpected tuple %r with StructType" % obj) @@ -1197,13 +1207,36 @@ def __new__(self, *args, **kwargs): else: raise ValueError("No args or kwargs") - def asDict(self): + def asDict(self, recursive=False): """ Return as an dict + + :param recursive: turns the nested Row as dict (default: False). + + >>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11} + True + >>> row = Row(key=1, value=Row(name='a', age=2)) + >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')} + True + >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}} + True """ if not hasattr(self, "__fields__"): raise TypeError("Cannot convert a Row class into dict") - return dict(zip(self.__fields__, self)) + + if recursive: + def conv(obj): + if isinstance(obj, Row): + return obj.asDict(True) + elif isinstance(obj, list): + return [conv(o) for o in obj] + elif isinstance(obj, dict): + return dict((k, conv(v)) for k, v in obj.items()) + else: + return obj + return dict(zip(self.__fields__, (conv(o) for o in self))) + else: + return dict(zip(self.__fields__, self)) # let object acts like class def __call__(self, *args): @@ -1223,6 +1256,11 @@ def __getattr__(self, item): except ValueError: raise AttributeError(item) + def __setattr__(self, key, value): + if key != '__fields__': + raise Exception("Row is read-only") + self.__dict__[key] = value + def __reduce__(self): """Returns a tuple so Python knows how to pickle Row.""" if hasattr(self, "__fields__"): diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py index c74745c726a0..eaf4d7e98620 100644 --- a/python/pyspark/sql/window.py +++ b/python/pyspark/sql/window.py @@ -64,7 +64,7 @@ def orderBy(*cols): Creates a :class:`WindowSpec` with the partitioning defined. 
""" sc = SparkContext._active_spark_context - jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols)) + jspec = sc._jvm.org.apache.spark.sql.expressions.Window.orderBy(_to_java_cols(cols)) return WindowSpec(jspec) diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py index 944fa414b0c0..0fee3b209682 100644 --- a/python/pyspark/statcounter.py +++ b/python/pyspark/statcounter.py @@ -30,7 +30,9 @@ class StatCounter(object): - def __init__(self, values=[]): + def __init__(self, values=None): + if values is None: + values = list() self.n = 0 # Running count of our values self.mu = 0.0 # Running mean of our values self.m2 = 0.0 # Running variance numerator (sum of (x - mean)^2) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ac5ba69e8dbb..4069d7a14998 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -86,6 +86,9 @@ class StreamingContext(object): """ _transformerSerializer = None + # Reference to a currently active StreamingContext + _activeContext = None + def __init__(self, sparkContext, batchDuration=None, jssc=None): """ Create a new StreamingContext. @@ -142,34 +145,84 @@ def getOrCreate(cls, checkpointPath, setupFunc): Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be recreated from the checkpoint data. If the data does not exist, then the provided setupFunc - will be used to create a JavaStreamingContext. + will be used to create a new context. - @param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program - @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams + @param checkpointPath: Checkpoint directory used in an earlier streaming program + @param setupFunc: Function to create a new context and setup DStreams """ - # TODO: support checkpoint in HDFS - if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): + cls._ensure_initialized() + gw = SparkContext._gateway + + # Check whether valid checkpoint information exists in the given path + if gw.jvm.CheckpointReader.read(checkpointPath).isEmpty(): ssc = setupFunc() ssc.checkpoint(checkpointPath) return ssc - cls._ensure_initialized() - gw = SparkContext._gateway - try: jssc = gw.jvm.JavaStreamingContext(checkpointPath) except Exception: print("failed to load StreamingContext from checkpoint", file=sys.stderr) raise - jsc = jssc.sparkContext() - conf = SparkConf(_jconf=jsc.getConf()) - sc = SparkContext(conf=conf, gateway=gw, jsc=jsc) + # If there is already an active instance of Python SparkContext use it, or create a new one + if not SparkContext._active_spark_context: + jsc = jssc.sparkContext() + conf = SparkConf(_jconf=jsc.getConf()) + SparkContext(conf=conf, gateway=gw, jsc=jsc) + + sc = SparkContext._active_spark_context + # update ctx in serializer - SparkContext._active_spark_context = sc cls._transformerSerializer.ctx = sc return StreamingContext(sc, None, jssc) + @classmethod + def getActive(cls): + """ + Return either the currently active StreamingContext (i.e., if there is a context started + but not stopped) or None. 
+ """ + activePythonContext = cls._activeContext + if activePythonContext is not None: + # Verify that the current running Java StreamingContext is active and is the same one + # backing the supposedly active Python context + activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode() + activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive() + + if activeJvmContextOption.isEmpty(): + cls._activeContext = None + elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId: + cls._activeContext = None + raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext " + "backing the action Python StreamingContext. This is unexpected.") + return cls._activeContext + + @classmethod + def getActiveOrCreate(cls, checkpointPath, setupFunc): + """ + Either return the active StreamingContext (i.e. currently started but not stopped), + or recreate a StreamingContext from checkpoint data or create a new StreamingContext + using the provided setupFunc function. If the checkpointPath is None or does not contain + valid checkpoint data, then setupFunc will be called to create a new context and setup + DStreams. + + @param checkpointPath: Checkpoint directory used in an earlier streaming program. Can be + None if the intention is to always create a new context when there + is no active context. + @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams + """ + + if setupFunc is None: + raise Exception("setupFunc cannot be None") + activeContext = cls.getActive() + if activeContext is not None: + return activeContext + elif checkpointPath is not None: + return cls.getOrCreate(checkpointPath, setupFunc) + else: + return setupFunc() + @property def sparkContext(self): """ @@ -182,6 +235,7 @@ def start(self): Start the execution of the streams. """ self._jssc.start() + StreamingContext._activeContext = self def awaitTermination(self, timeout=None): """ @@ -212,6 +266,7 @@ def stop(self, stopSparkContext=True, stopGraceFully=False): of all received data to be completed """ self._jssc.stop(stopSparkContext, stopGraceFully) + StreamingContext._activeContext = None if stopSparkContext: self._sc.stop() diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8dcb9645cdc6..698336cfce18 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -610,7 +610,10 @@ def __init__(self, prev, func): self.is_checkpointed = False self._jdstream_val = None - if (isinstance(prev, TransformedDStream) and + # Using type() to avoid folding the functions and compacting the DStreams which is not + # not strictly a object of TransformedDStream. + # Changed here is to avoid bug in KafkaTransformedDStream when calling offsetRanges(). 
+ if (type(prev) is TransformedDStream and not prev.is_cached and not prev.is_checkpointed): prev_func = prev.func self.func = lambda t, rdd: func(t, prev_func(t, rdd)) diff --git a/python/pyspark/streaming/flume.py b/python/pyspark/streaming/flume.py index cbb573f226bb..c0cdc50d8d42 100644 --- a/python/pyspark/streaming/flume.py +++ b/python/pyspark/streaming/flume.py @@ -31,7 +31,9 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class FlumeUtils(object): diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 33dd596335b4..8a814c64c042 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -29,13 +29,15 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class KafkaUtils(object): @staticmethod - def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, + def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ @@ -52,6 +54,8 @@ def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object """ + if kafkaParams is None: + kafkaParams = dict() kafkaParams.update({ "zookeeper.connect": zkQuorum, "group.id": groupId, @@ -77,7 +81,7 @@ def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1]))) @staticmethod - def createDirectStream(ssc, topics, kafkaParams, fromOffsets={}, + def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ .. note:: Experimental @@ -105,6 +109,8 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets={}, :param valueDecoder: A function used to decode value (default is utf8_decoder). :return: A DStream object """ + if fromOffsets is None: + fromOffsets = dict() if not isinstance(topics, list): raise TypeError("topics should be list") if not isinstance(kafkaParams, dict): @@ -129,7 +135,7 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets={}, return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer) @staticmethod - def createRDD(sc, kafkaParams, offsetRanges, leaders={}, + def createRDD(sc, kafkaParams, offsetRanges, leaders=None, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ .. 
note:: Experimental @@ -145,6 +151,8 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders={}, :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A RDD object """ + if leaders is None: + leaders = dict() if not isinstance(kafkaParams, dict): raise TypeError("kafkaParams should be dict") if not isinstance(offsetRanges, list): diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py index bcfe2703fecf..34be5880e170 100644 --- a/python/pyspark/streaming/kinesis.py +++ b/python/pyspark/streaming/kinesis.py @@ -26,7 +26,9 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class KinesisUtils(object): diff --git a/python/pyspark/streaming/mqtt.py b/python/pyspark/streaming/mqtt.py new file mode 100644 index 000000000000..f06598971c54 --- /dev/null +++ b/python/pyspark/streaming/mqtt.py @@ -0,0 +1,72 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from py4j.java_gateway import Py4JJavaError + +from pyspark.storagelevel import StorageLevel +from pyspark.serializers import UTF8Deserializer +from pyspark.streaming import DStream + +__all__ = ['MQTTUtils'] + + +class MQTTUtils(object): + + @staticmethod + def createStream(ssc, brokerUrl, topic, + storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2): + """ + Create an input stream that pulls messages from a Mqtt Broker. + :param ssc: StreamingContext object + :param brokerUrl: Url of remote mqtt publisher + :param topic: topic name to subscribe to + :param storageLevel: RDD storage level. + :return: A DStream object + """ + jlevel = ssc._sc._getJavaStorageLevel(storageLevel) + + try: + helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \ + .loadClass("org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper") + helper = helperClass.newInstance() + jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel) + except Py4JJavaError as e: + if 'ClassNotFoundException' in str(e.java_exception): + MQTTUtils._printErrorMsg(ssc.sparkContext) + raise e + + return DStream(jstream, ssc, UTF8Deserializer()) + + @staticmethod + def _printErrorMsg(sc): + print(""" +________________________________________________________________________________________________ + + Spark Streaming's MQTT libraries not found in class path. Try one of the following. + + 1. Include the MQTT library and its dependencies with in the + spark-submit command as + + $ bin/spark-submit --packages org.apache.spark:spark-streaming-mqtt:%s ... + + 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, + Group Id = org.apache.spark, Artifact Id = spark-streaming-mqtt-assembly, Version = %s. 
+ Then, include the jar in the spark-submit command as + + $ bin/spark-submit --jars ... +________________________________________________________________________________________________ +""" % (sc.version, sc.version)) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 5cd544b2144e..cfea95b0dec7 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -24,6 +24,7 @@ import tempfile import random import struct +import shutil from functools import reduce if sys.version_info[:2] <= (2, 6): @@ -40,6 +41,7 @@ from pyspark.streaming.context import StreamingContext from pyspark.streaming.kafka import Broker, KafkaUtils, OffsetRange, TopicAndPartition from pyspark.streaming.flume import FlumeUtils +from pyspark.streaming.mqtt import MQTTUtils from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream @@ -58,12 +60,21 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): cls.sc.stop() + # Clean up in the JVM just in case there has been some issues in Python API + jSparkContextOption = SparkContext._jvm.SparkContext.get() + if jSparkContextOption.nonEmpty(): + jSparkContextOption.get().stop() def setUp(self): self.ssc = StreamingContext(self.sc, self.duration) def tearDown(self): - self.ssc.stop(False) + if self.ssc is not None: + self.ssc.stop(False) + # Clean up in the JVM just in case there has been some issues in Python API + jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive() + if jStreamingContextOption.nonEmpty(): + jStreamingContextOption.get().stop(False) def wait_for(self, result, n): start_time = time.time() @@ -441,6 +452,7 @@ def test_reduce_by_invalid_window(self): class StreamingContextTests(PySparkStreamingTestCase): duration = 0.1 + setupCalled = False def _add_input_stream(self): inputs = [range(1, x) for x in range(101)] @@ -514,10 +526,89 @@ def func(rdds): self.assertEqual([2, 3, 1], self._take(dstream, 3)) + def test_get_active(self): + self.assertEqual(StreamingContext.getActive(), None) + + # Verify that getActive() returns the active context + self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) + self.ssc.start() + self.assertEqual(StreamingContext.getActive(), self.ssc) + + # Verify that getActive() returns None + self.ssc.stop(False) + self.assertEqual(StreamingContext.getActive(), None) + + # Verify that if the Java context is stopped, then getActive() returns None + self.ssc = StreamingContext(self.sc, self.duration) + self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) + self.ssc.start() + self.assertEqual(StreamingContext.getActive(), self.ssc) + self.ssc._jssc.stop(False) + self.assertEqual(StreamingContext.getActive(), None) + + def test_get_active_or_create(self): + # Test StreamingContext.getActiveOrCreate() without checkpoint data + # See CheckpointTests for tests with checkpoint data + self.ssc = None + self.assertEqual(StreamingContext.getActive(), None) + + def setupFunc(): + ssc = StreamingContext(self.sc, self.duration) + ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) + self.setupCalled = True + return ssc + + # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) + self.assertTrue(self.setupCalled) + + # Verify that getActiveOrCreate() retuns active context and does not call the setupFunc + self.ssc.start() + self.setupCalled = False + 
self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc) + self.assertFalse(self.setupCalled) + + # Verify that getActiveOrCreate() calls setupFunc after active context is stopped + self.ssc.stop(False) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) + self.assertTrue(self.setupCalled) + + # Verify that if the Java context is stopped, then getActive() returns None + self.ssc = StreamingContext(self.sc, self.duration) + self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) + self.ssc.start() + self.assertEqual(StreamingContext.getActive(), self.ssc) + self.ssc._jssc.stop(False) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) + self.assertTrue(self.setupCalled) + class CheckpointTests(unittest.TestCase): - def test_get_or_create(self): + setupCalled = False + + @staticmethod + def tearDownClass(): + # Clean up in the JVM just in case there has been some issues in Python API + jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive() + if jStreamingContextOption.nonEmpty(): + jStreamingContextOption.get().stop() + jSparkContextOption = SparkContext._jvm.SparkContext.get() + if jSparkContextOption.nonEmpty(): + jSparkContextOption.get().stop() + + def tearDown(self): + if self.ssc is not None: + self.ssc.stop(True) + if self.sc is not None: + self.sc.stop() + if self.cpd is not None: + shutil.rmtree(self.cpd) + + def test_get_or_create_and_get_active_or_create(self): inputd = tempfile.mkdtemp() outputd = tempfile.mkdtemp() + "/" @@ -532,11 +623,16 @@ def setup(): wc = dstream.updateStateByKey(updater) wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test") wc.checkpoint(.5) + self.setupCalled = True return ssc - cpd = tempfile.mkdtemp("test_streaming_cps") - ssc = StreamingContext.getOrCreate(cpd, setup) - ssc.start() + # Verify that getOrCreate() calls setup() in absence of checkpoint files + self.cpd = tempfile.mkdtemp("test_streaming_cps") + self.setupCalled = False + self.ssc = StreamingContext.getOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + + self.ssc.start() def check_output(n): while not os.listdir(outputd): @@ -551,7 +647,7 @@ def check_output(n): # not finished time.sleep(0.01) continue - ordd = ssc.sparkContext.textFile(p).map(lambda line: line.split(",")) + ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(",")) d = ordd.values().map(int).collect() if not d: time.sleep(0.01) @@ -567,13 +663,58 @@ def check_output(n): check_output(1) check_output(2) - ssc.stop(True, True) + # Verify the getOrCreate() recovers from checkpoint files + self.ssc.stop(True, True) time.sleep(1) - ssc = StreamingContext.getOrCreate(cpd, setup) - ssc.start() + self.setupCalled = False + self.ssc = StreamingContext.getOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.ssc.start() check_output(3) - ssc.stop(True, True) + + # Verify that getOrCreate() uses existing SparkContext + self.ssc.stop(True, True) + time.sleep(1) + sc = SparkContext(SparkConf()) + self.setupCalled = False + self.ssc = StreamingContext.getOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.assertTrue(self.ssc.sparkContext == sc) + + # Verify the getActiveOrCreate() recovers from checkpoint files + self.ssc.stop(True, True) + time.sleep(1) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.ssc.start() + check_output(4) + + # Verify that 
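The getActive()/getActiveOrCreate() tests above imply the driver pattern sketched below: a setup function that is invoked only when there is neither an active context nor a usable checkpoint. The checkpoint directory and app name are placeholders:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="ActiveOrCreateDemo")
CHECKPOINT_DIR = "/tmp/demo-checkpoint"     # placeholder path

def setup():
    # Only runs when there is no active context and no usable checkpoint data.
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(CHECKPOINT_DIR)
    ssc.queueStream([[1, 2, 3]]).foreachRDD(lambda rdd: rdd.count())
    return ssc

ssc = StreamingContext.getActiveOrCreate(CHECKPOINT_DIR, setup)
ssc.start()

# While it is running, later callers get the same context back instead of a new one.
assert StreamingContext.getActive() == ssc
assert StreamingContext.getActiveOrCreate(CHECKPOINT_DIR, setup) == ssc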
getActiveOrCreate() returns active context + self.setupCalled = False + self.assertEquals(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc) + self.assertFalse(self.setupCalled) + + # Verify that getActiveOrCreate() uses existing SparkContext + self.ssc.stop(True, True) + time.sleep(1) + self.sc = SparkContext(SparkConf()) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.assertTrue(self.ssc.sparkContext == sc) + + # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files + self.ssc.stop(True, True) + shutil.rmtree(self.cpd) # delete checkpoint directory + time.sleep(1) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) + self.assertTrue(self.setupCalled) + + # Stop everything + self.ssc.stop(True, True) class KafkaStreamTests(PySparkStreamingTestCase): @@ -738,7 +879,9 @@ def transformWithOffsetRanges(rdd): offsetRanges.append(o) return rdd - stream.transform(transformWithOffsetRanges).foreachRDD(lambda rdd: rdd.count()) + # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together, + # only the TransformedDstreams can be folded together. + stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint() self.ssc.start() self.wait_for(offsetRanges, 1) @@ -893,6 +1036,68 @@ def test_flume_polling_multiple_hosts(self): self._testMultipleTimes(self._testFlumePollingMultipleHosts) +class MQTTStreamTests(PySparkStreamingTestCase): + timeout = 20 # seconds + duration = 1 + + def setUp(self): + super(MQTTStreamTests, self).setUp() + + MQTTTestUtilsClz = self.ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \ + .loadClass("org.apache.spark.streaming.mqtt.MQTTTestUtils") + self._MQTTTestUtils = MQTTTestUtilsClz.newInstance() + self._MQTTTestUtils.setup() + + def tearDown(self): + if self._MQTTTestUtils is not None: + self._MQTTTestUtils.teardown() + self._MQTTTestUtils = None + + super(MQTTStreamTests, self).tearDown() + + def _randomTopic(self): + return "topic-%d" % random.randint(0, 10000) + + def _startContext(self, topic): + # Start the StreamingContext and also collect the result + stream = MQTTUtils.createStream(self.ssc, "tcp://" + self._MQTTTestUtils.brokerUri(), topic) + result = [] + + def getOutput(_, rdd): + for data in rdd.collect(): + result.append(data) + + stream.foreachRDD(getOutput) + self.ssc.start() + return result + + def test_mqtt_stream(self): + """Test the Python MQTT stream API.""" + sendData = "MQTT demo for spark streaming" + topic = self._randomTopic() + result = self._startContext(topic) + + def retry(): + self._MQTTTestUtils.publishData(topic, sendData) + # Because "publishData" sends duplicate messages, here we should use > 0 + self.assertTrue(len(result) > 0) + self.assertEqual(sendData, result[0]) + + # Retry it because we don't know when the receiver will start. 
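The MQTT test polls its assertions with _retry_or_timeout because the receiver's start time is unknown. A generic form of that poll-until-deadline idiom (this sketch retries only on AssertionError, whereas the test's bare except retries on anything):

import time

def retry_or_timeout(check, timeout=20, interval=0.01):
    """Re-run `check` until it stops raising or `timeout` seconds elapse."""
    deadline = time.time() + timeout
    while True:
        try:
            return check()
        except AssertionError:
            if time.time() > deadline:
                raise
            time.sleep(interval)

received = []              # filled asynchronously by a receiver in the real test

def saw_first_message():
    assert received, "no messages received yet"
    return received[0]

# first = retry_or_timeout(saw_first_message)   # would block for up to 20s here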
+ self._retry_or_timeout(retry) + + def _retry_or_timeout(self, test_func): + start_time = time.time() + while True: + try: + test_func() + break + except: + if time.time() - start_time > self.timeout: + raise + time.sleep(0.01) + + class KinesisStreamTests(PySparkStreamingTestCase): def test_kinesis_stream_api(self): @@ -908,8 +1113,10 @@ def test_kinesis_stream_api(self): "awsAccessKey", "awsSecretKey") def test_kinesis_stream(self): - if os.environ.get('ENABLE_KINESIS_TESTS') != '1': - print("Skip test_kinesis_stream") + if not are_kinesis_tests_enabled: + sys.stderr.write( + "Skipped test_kinesis_stream (enable by setting environment variable %s=1" + % kinesis_test_environ_var) return import random @@ -950,24 +1157,34 @@ def get_output(_, rdd): traceback.print_exc() raise finally: + self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) +# Search jar in the project dir using the jar name_prefix for both sbt build and maven build because +# the artifact jars are in different directories. +def search_jar(dir, name_prefix): + # We should ignore the following jars + ignored_jar_suffixes = ("javadoc.jar", "sources.jar", "test-sources.jar", "tests.jar") + jars = (glob.glob(os.path.join(dir, "target/scala-*/" + name_prefix + "-*.jar")) + # sbt build + glob.glob(os.path.join(dir, "target/" + name_prefix + "_*.jar"))) # maven build + return [jar for jar in jars if not jar.endswith(ignored_jar_suffixes)] + + def search_kafka_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") - jars = glob.glob( - os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + jars = search_jar(kafka_assembly_dir, "spark-streaming-kafka-assembly") if not jars: raise Exception( ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " - "'build/mvn package' before running this test") + "'build/mvn package' before running this test.") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " - "remove all but one") % kafka_assembly_dir) + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -975,45 +1192,109 @@ def search_kafka_assembly_jar(): def search_flume_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] flume_assembly_dir = os.path.join(SPARK_HOME, "external/flume-assembly") - jars = glob.glob( - os.path.join(flume_assembly_dir, "target/scala-*/spark-streaming-flume-assembly-*.jar")) + jars = search_jar(flume_assembly_dir, "spark-streaming-flume-assembly") if not jars: raise Exception( ("Failed to find Spark Streaming Flume assembly jar in %s. 
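search_jar above globs both the sbt (target/scala-*/) and maven (target/) output locations and drops javadoc/sources/tests artifacts. A self-contained demonstration of that suffix filtering against synthetic file names (the directory and jar names below are made up):

import glob
import os
import tempfile

ignored_jar_suffixes = ("javadoc.jar", "sources.jar", "test-sources.jar", "tests.jar")

d = tempfile.mkdtemp()
for name in ("spark-streaming-kafka-assembly-1.5.0.jar",
             "spark-streaming-kafka-assembly-1.5.0-sources.jar",
             "spark-streaming-kafka-assembly-1.5.0-javadoc.jar"):
    open(os.path.join(d, name), "w").close()

jars = [j for j in glob.glob(os.path.join(d, "spark-streaming-kafka-assembly-*.jar"))
        if not j.endswith(ignored_jar_suffixes)]     # str.endswith accepts a tuple

assert len(jars) == 1 and jars[0].endswith("assembly-1.5.0.jar")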
" % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " + "'build/mvn package' before running this test.") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) + else: + return jars[0] + + +def search_mqtt_assembly_jar(): + SPARK_HOME = os.environ["SPARK_HOME"] + mqtt_assembly_dir = os.path.join(SPARK_HOME, "external/mqtt-assembly") + jars = search_jar(mqtt_assembly_dir, "spark-streaming-mqtt-assembly") + if not jars: + raise Exception( + ("Failed to find Spark Streaming MQTT assembly jar in %s. " % mqtt_assembly_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-mqtt-assembly/assembly' or " "'build/mvn package' before running this test") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please " - "remove all but one") % flume_assembly_dir) + raise Exception(("Found multiple Spark Streaming MQTT assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] -def search_kinesis_asl_assembly_jar(): +def search_mqtt_test_jar(): SPARK_HOME = os.environ["SPARK_HOME"] - kinesis_asl_assembly_dir = os.path.join(SPARK_HOME, "extras/kinesis-asl-assembly") + mqtt_test_dir = os.path.join(SPARK_HOME, "external/mqtt") jars = glob.glob( - os.path.join(kinesis_asl_assembly_dir, - "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) + os.path.join(mqtt_test_dir, "target/scala-*/spark-streaming-mqtt-test-*.jar")) if not jars: raise Exception( - ("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " - "'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " - "or 'build/mvn -Pkinesis-asl package' before running this test") + ("Failed to find Spark Streaming MQTT test jar in %s. 
" % mqtt_test_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-mqtt/test:assembly'") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming MQTT test JARs: %s; please " + "remove all but one") % (", ".join(jars))) + else: + return jars[0] + + +def search_kinesis_asl_assembly_jar(): + SPARK_HOME = os.environ["SPARK_HOME"] + kinesis_asl_assembly_dir = os.path.join(SPARK_HOME, "extras/kinesis-asl-assembly") + jars = search_jar(kinesis_asl_assembly_dir, "spark-streaming-kinesis-asl-assembly") + if not jars: + return None elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please " - "remove all but one") % kinesis_asl_assembly_dir) + raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] +# Must be same as the variable and condition defined in KinesisTestUtils.scala +kinesis_test_environ_var = "ENABLE_KINESIS_TESTS" +are_kinesis_tests_enabled = os.environ.get(kinesis_test_environ_var) == '1' + if __name__ == "__main__": kafka_assembly_jar = search_kafka_assembly_jar() flume_assembly_jar = search_flume_assembly_jar() + mqtt_assembly_jar = search_mqtt_assembly_jar() + mqtt_test_jar = search_mqtt_test_jar() kinesis_asl_assembly_jar = search_kinesis_asl_assembly_jar() - jars = "%s,%s,%s" % (kafka_assembly_jar, flume_assembly_jar, kinesis_asl_assembly_jar) + + if kinesis_asl_assembly_jar is None: + kinesis_jar_present = False + jars = "%s,%s,%s,%s" % (kafka_assembly_jar, flume_assembly_jar, mqtt_assembly_jar, + mqtt_test_jar) + else: + kinesis_jar_present = True + jars = "%s,%s,%s,%s,%s" % (kafka_assembly_jar, flume_assembly_jar, mqtt_assembly_jar, + mqtt_test_jar, kinesis_asl_assembly_jar) os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars - unittest.main() + testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, + KafkaStreamTests, FlumeStreamTests, FlumePollingStreamTests, MQTTStreamTests] + + if kinesis_jar_present is True: + testcases.append(KinesisStreamTests) + elif are_kinesis_tests_enabled is False: + sys.stderr.write("Skipping all Kinesis Python tests as the optional Kinesis project was " + "not compiled into a JAR. To run these tests, " + "you need to build Spark with 'build/sbt -Pkinesis-asl assembly/assembly " + "streaming-kinesis-asl-assembly/assembly' or " + "'build/mvn -Pkinesis-asl package' before running this test.") + else: + raise Exception( + ("Failed to find Spark Streaming Kinesis assembly jar in %s. 
" + % kinesis_asl_assembly_dir) + + "You need to build Spark with 'build/sbt -Pkinesis-asl " + "assembly/assembly streaming-kinesis-asl-assembly/assembly'" + "or 'build/mvn -Pkinesis-asl package' before running this test.") + + sys.stderr.write("Running tests: %s \n" % (str(testcases))) + for testcase in testcases: + sys.stderr.write("[Running %s]\n" % (testcase)) + tests = unittest.TestLoader().loadTestsFromTestCase(testcase) + unittest.TextTestRunner(verbosity=3).run(tests) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 8bfed074c905..647504c32f15 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -218,6 +218,11 @@ def test_namedtuple(self): p2 = loads(dumps(p1, 2)) self.assertEqual(p1, p2) + from pyspark.cloudpickle import dumps + P2 = loads(dumps(P)) + p3 = P2(1, 3) + self.assertEqual(p1, p3) + def test_itemgetter(self): from operator import itemgetter ser = CloudPickleSerializer() diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 93df9002be37..42c2f8b75933 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -146,5 +146,5 @@ def process(): java_port = int(sys.stdin.readline()) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("127.0.0.1", java_port)) - sock_file = sock.makefile("a+", 65536) + sock_file = sock.makefile("rwb", 65536) main(sock_file, sock_file) diff --git a/python/run-tests.py b/python/run-tests.py index cc560779373b..fd56c7ab6e0e 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -158,7 +158,7 @@ def main(): else: log_level = logging.INFO logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s") - LOGGER.info("Running PySpark tests. Output is in python/%s", LOG_FILE) + LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE) if os.path.exists(LOG_FILE): os.remove(LOG_FILE) python_execs = opts.python_executables.split(',') diff --git a/repl/pom.xml b/repl/pom.xml index a5a0f1fc2c85..55736215a5d2 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 8130868fe148..304b1e8cdbed 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -981,7 +981,7 @@ class SparkILoop( // which spins off a separate thread, then print the prompt and try // our best to look ready. The interlocking lazy vals tend to // inter-deadlock, so we break the cycle with a single asynchronous - // message to an actor. + // message to an rpcEndpoint. 
if (isAsync) { intp initialize initializedCallback() createAsyncListener() // listens for signal to run postInitialization diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index be31eb2eda54..627148df80c1 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -35,7 +35,8 @@ object Main extends Logging { s.processArguments(List("-Yrepl-class-based", "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", "-classpath", getAddedJars.mkString(File.pathSeparator)), true) - val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf)) + // the creation of SecurityManager has to be lazy so SPARK_YARN_MODE is set if needed + lazy val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf)) var sparkContext: SparkContext = _ var sqlContext: SQLContext = _ var interp = new SparkILoop // this is a public var because tests reset it. diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index de762acc8fa0..0fbe795822fb 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -29,7 +29,7 @@ # SPARK_NICENESS The scheduling priority for daemons. Defaults to 0. ## -usage="Usage: spark-daemon.sh [--config ] (start|stop|status) " +usage="Usage: spark-daemon.sh [--config ] (start|stop|submit|status) " # if no args specified, show usage if [ $# -le 1 ]; then diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index f4b1cc3a4ffe..0da46f2362c0 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -66,7 +66,6 @@ org.codehaus.janino janino - 2.7.8 diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 0374846d7167..501dff090313 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.*; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; import org.apache.spark.unsafe.types.CalendarInterval; @@ -59,7 +59,7 @@ public class UnsafeArrayData extends ArrayData { private int sizeInBytes; private int getElementOffset(int ordinal) { - return PlatformDependent.UNSAFE.getInt(baseObject, baseOffset + ordinal * 4L); + return Platform.getInt(baseObject, baseOffset + ordinal * 4L); } private int getElementSize(int offset, int ordinal) { @@ -157,7 +157,7 @@ public boolean getBoolean(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return false; - return PlatformDependent.UNSAFE.getBoolean(baseObject, baseOffset + offset); + return Platform.getBoolean(baseObject, baseOffset + offset); } @Override @@ -165,7 +165,7 @@ public byte getByte(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getByte(baseObject, baseOffset + offset); + return Platform.getByte(baseObject, baseOffset + offset); } @Override @@ -173,7 +173,7 
@@ public short getShort(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getShort(baseObject, baseOffset + offset); + return Platform.getShort(baseObject, baseOffset + offset); } @Override @@ -181,7 +181,7 @@ public int getInt(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getInt(baseObject, baseOffset + offset); + return Platform.getInt(baseObject, baseOffset + offset); } @Override @@ -189,7 +189,7 @@ public long getLong(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset); + return Platform.getLong(baseObject, baseOffset + offset); } @Override @@ -197,7 +197,7 @@ public float getFloat(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getFloat(baseObject, baseOffset + offset); + return Platform.getFloat(baseObject, baseOffset + offset); } @Override @@ -205,7 +205,7 @@ public double getDouble(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return 0; - return PlatformDependent.UNSAFE.getDouble(baseObject, baseOffset + offset); + return Platform.getDouble(baseObject, baseOffset + offset); } @Override @@ -215,7 +215,7 @@ public Decimal getDecimal(int ordinal, int precision, int scale) { if (offset < 0) return null; if (precision <= Decimal.MAX_LONG_DIGITS()) { - final long value = PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset); + final long value = Platform.getLong(baseObject, baseOffset + offset); return Decimal.apply(value, precision, scale); } else { final byte[] bytes = getBinary(ordinal); @@ -241,12 +241,7 @@ public byte[] getBinary(int ordinal) { if (offset < 0) return null; final int size = getElementSize(offset, ordinal); final byte[] bytes = new byte[size]; - PlatformDependent.copyMemory( - baseObject, - baseOffset + offset, - bytes, - PlatformDependent.BYTE_ARRAY_OFFSET, - size); + Platform.copyMemory(baseObject, baseOffset + offset, bytes, Platform.BYTE_ARRAY_OFFSET, size); return bytes; } @@ -255,9 +250,8 @@ public CalendarInterval getInterval(int ordinal) { assertIndexIsValid(ordinal); final int offset = getElementOffset(ordinal); if (offset < 0) return null; - final int months = (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset); - final long microseconds = - PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset + 8); + final int months = (int) Platform.getLong(baseObject, baseOffset + offset); + final long microseconds = Platform.getLong(baseObject, baseOffset + offset + 8); return new CalendarInterval(months, microseconds); } @@ -307,27 +301,16 @@ public boolean equals(Object other) { } public void writeToMemory(Object target, long targetOffset) { - PlatformDependent.copyMemory( - baseObject, - baseOffset, - target, - targetOffset, - sizeInBytes - ); + Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); } @Override public UnsafeArrayData copy() { UnsafeArrayData arrayCopy = new UnsafeArrayData(); final byte[] arrayDataCopy = new byte[sizeInBytes]; - PlatformDependent.copyMemory( - baseObject, - baseOffset, - arrayDataCopy, - PlatformDependent.BYTE_ARRAY_OFFSET, - sizeInBytes - ); - 
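The mechanical change through this file is PlatformDependent.UNSAFE.getX(base, offset) becoming Platform.getX(base, offset): the same offset-addressed primitive access with shorter call sites. As a loose analogy only (not Spark code), offset-based primitive reads and writes over a flat buffer look like this in Python:

import struct

buf = bytearray(16)
struct.pack_into("<i", buf, 0, 42)          # roughly "putInt(buf, 0, 42)"
struct.pack_into("<q", buf, 8, 1 << 40)     # roughly "putLong(buf, 8, 1 << 40)"

def get_int(b, off):
    return struct.unpack_from("<i", b, off)[0]

def get_long(b, off):
    return struct.unpack_from("<q", b, off)[0]

assert get_int(buf, 0) == 42
assert get_long(buf, 8) == 1 << 40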
arrayCopy.pointTo(arrayDataCopy, PlatformDependent.BYTE_ARRAY_OFFSET, numElements, sizeInBytes); + Platform.copyMemory( + baseObject, baseOffset, arrayDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); + arrayCopy.pointTo(arrayDataCopy, Platform.BYTE_ARRAY_OFFSET, numElements, sizeInBytes); return arrayCopy; } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java index b521b703389d..7b03185a30e3 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java @@ -17,13 +17,13 @@ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; public class UnsafeReaders { public static UnsafeArrayData readArray(Object baseObject, long baseOffset, int numBytes) { // Read the number of elements from first 4 bytes. - final int numElements = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset); + final int numElements = Platform.getInt(baseObject, baseOffset); final UnsafeArrayData array = new UnsafeArrayData(); // Skip the first 4 bytes. array.pointTo(baseObject, baseOffset + 4, numElements, numBytes - 4); @@ -32,9 +32,9 @@ public static UnsafeArrayData readArray(Object baseObject, long baseOffset, int public static UnsafeMapData readMap(Object baseObject, long baseOffset, int numBytes) { // Read the number of elements from first 4 bytes. - final int numElements = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset); + final int numElements = Platform.getInt(baseObject, baseOffset); // Read the numBytes of key array in second 4 bytes. - final int keyArraySize = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset + 4); + final int keyArraySize = Platform.getInt(baseObject, baseOffset + 4); final int valueArraySize = numBytes - 8 - keyArraySize; final UnsafeArrayData keyArray = new UnsafeArrayData(); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index f4230cfaba37..6c020045c311 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -27,7 +27,7 @@ import java.util.Set; import org.apache.spark.sql.types.*; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.bitset.BitSetMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; @@ -59,17 +59,17 @@ public final class UnsafeRow extends MutableRow { ////////////////////////////////////////////////////////////////////////////// public static int calculateBitSetWidthInBytes(int numFields) { - return ((numFields / 64) + (numFields % 64 == 0 ? 0 : 1)) * 8; + return ((numFields + 63)/ 64) * 8; } /** * Field types that can be updated in place in UnsafeRows (e.g. 
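The new calculateBitSetWidthInBytes body, ((numFields + 63) / 64) * 8, is the usual round-up-to-whole-64-bit-words form of the older two-branch expression. A quick equivalence check (sketch, not Spark code):

def bitset_width_old(num_fields):
    return (num_fields // 64 + (0 if num_fields % 64 == 0 else 1)) * 8

def bitset_width_new(num_fields):
    return ((num_fields + 63) // 64) * 8

assert all(bitset_width_old(n) == bitset_width_new(n) for n in range(0, 1025))
assert bitset_width_new(0) == 0 and bitset_width_new(1) == 8 and bitset_width_new(65) == 16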
we support set() for these types) */ - public static final Set settableFieldTypes; + public static final Set mutableFieldTypes; - // DecimalType(precision <= 18) is settable + // DecimalType is also mutable static { - settableFieldTypes = Collections.unmodifiableSet( + mutableFieldTypes = Collections.unmodifiableSet( new HashSet<>( Arrays.asList(new DataType[] { NullType, @@ -87,12 +87,16 @@ public static int calculateBitSetWidthInBytes(int numFields) { public static boolean isFixedLength(DataType dt) { if (dt instanceof DecimalType) { - return ((DecimalType) dt).precision() < Decimal.MAX_LONG_DIGITS(); + return ((DecimalType) dt).precision() <= Decimal.MAX_LONG_DIGITS(); } else { - return settableFieldTypes.contains(dt); + return mutableFieldTypes.contains(dt); } } + public static boolean isMutable(DataType dt) { + return mutableFieldTypes.contains(dt) || dt instanceof DecimalType; + } + ////////////////////////////////////////////////////////////////////////////// // Private fields and methods ////////////////////////////////////////////////////////////////////////////// @@ -165,7 +169,7 @@ public void pointTo(Object baseObject, long baseOffset, int numFields, int sizeI * @param sizeInBytes the number of bytes valid in the byte array */ public void pointTo(byte[] buf, int numFields, int sizeInBytes) { - pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes); + pointTo(buf, Platform.BYTE_ARRAY_OFFSET, numFields, sizeInBytes); } @Override @@ -175,7 +179,7 @@ public void setNullAt(int i) { // To preserve row equality, zero out the value when setting the column to null. // Since this row does does not currently support updates to variable-length values, we don't // have to worry about zeroing out that data. - PlatformDependent.UNSAFE.putLong(baseObject, getFieldOffset(i), 0); + Platform.putLong(baseObject, getFieldOffset(i), 0); } @Override @@ -187,14 +191,14 @@ public void update(int ordinal, Object value) { public void setInt(int ordinal, int value) { assertIndexIsValid(ordinal); setNotNullAt(ordinal); - PlatformDependent.UNSAFE.putInt(baseObject, getFieldOffset(ordinal), value); + Platform.putInt(baseObject, getFieldOffset(ordinal), value); } @Override public void setLong(int ordinal, long value) { assertIndexIsValid(ordinal); setNotNullAt(ordinal); - PlatformDependent.UNSAFE.putLong(baseObject, getFieldOffset(ordinal), value); + Platform.putLong(baseObject, getFieldOffset(ordinal), value); } @Override @@ -204,28 +208,28 @@ public void setDouble(int ordinal, double value) { if (Double.isNaN(value)) { value = Double.NaN; } - PlatformDependent.UNSAFE.putDouble(baseObject, getFieldOffset(ordinal), value); + Platform.putDouble(baseObject, getFieldOffset(ordinal), value); } @Override public void setBoolean(int ordinal, boolean value) { assertIndexIsValid(ordinal); setNotNullAt(ordinal); - PlatformDependent.UNSAFE.putBoolean(baseObject, getFieldOffset(ordinal), value); + Platform.putBoolean(baseObject, getFieldOffset(ordinal), value); } @Override public void setShort(int ordinal, short value) { assertIndexIsValid(ordinal); setNotNullAt(ordinal); - PlatformDependent.UNSAFE.putShort(baseObject, getFieldOffset(ordinal), value); + Platform.putShort(baseObject, getFieldOffset(ordinal), value); } @Override public void setByte(int ordinal, byte value) { assertIndexIsValid(ordinal); setNotNullAt(ordinal); - PlatformDependent.UNSAFE.putByte(baseObject, getFieldOffset(ordinal), value); + Platform.putByte(baseObject, getFieldOffset(ordinal), value); } @Override @@ -235,29 +239,51 @@ 
public void setFloat(int ordinal, float value) { if (Float.isNaN(value)) { value = Float.NaN; } - PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value); + Platform.putFloat(baseObject, getFieldOffset(ordinal), value); } + /** + * Updates the decimal column. + * + * Note: In order to support update a decimal with precision > 18, CAN NOT call + * setNullAt() for this column. + */ @Override public void setDecimal(int ordinal, Decimal value, int precision) { assertIndexIsValid(ordinal); - if (value == null) { - setNullAt(ordinal); - } else { - if (precision <= Decimal.MAX_LONG_DIGITS()) { + if (precision <= Decimal.MAX_LONG_DIGITS()) { + // compact format + if (value == null) { + setNullAt(ordinal); + } else { setLong(ordinal, value.toUnscaledLong()); + } + } else { + // fixed length + long cursor = getLong(ordinal) >>> 32; + assert cursor > 0 : "invalid cursor " + cursor; + // zero-out the bytes + Platform.putLong(baseObject, baseOffset + cursor, 0L); + Platform.putLong(baseObject, baseOffset + cursor + 8, 0L); + + if (value == null) { + setNullAt(ordinal); + // keep the offset for future update + Platform.putLong(baseObject, getFieldOffset(ordinal), cursor << 32); } else { - // TODO(davies): support update decimal (hold a bounded space even it's null) - throw new UnsupportedOperationException(); + + final BigInteger integer = value.toJavaBigDecimal().unscaledValue(); + byte[] bytes = integer.toByteArray(); + assert(bytes.length <= 16); + + // Write the bytes to the variable length portion. + Platform.copyMemory( + bytes, Platform.BYTE_ARRAY_OFFSET, baseObject, baseOffset + cursor, bytes.length); + setLong(ordinal, (cursor << 32) | ((long) bytes.length)); } } } - @Override - public Object genericGet(int ordinal) { - throw new UnsupportedOperationException(); - } - @Override public Object get(int ordinal, DataType dataType) { if (isNullAt(ordinal) || dataType instanceof NullType) { @@ -309,43 +335,43 @@ public boolean isNullAt(int ordinal) { @Override public boolean getBoolean(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getBoolean(baseObject, getFieldOffset(ordinal)); + return Platform.getBoolean(baseObject, getFieldOffset(ordinal)); } @Override public byte getByte(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getByte(baseObject, getFieldOffset(ordinal)); + return Platform.getByte(baseObject, getFieldOffset(ordinal)); } @Override public short getShort(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getShort(baseObject, getFieldOffset(ordinal)); + return Platform.getShort(baseObject, getFieldOffset(ordinal)); } @Override public int getInt(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getInt(baseObject, getFieldOffset(ordinal)); + return Platform.getInt(baseObject, getFieldOffset(ordinal)); } @Override public long getLong(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getLong(baseObject, getFieldOffset(ordinal)); + return Platform.getLong(baseObject, getFieldOffset(ordinal)); } @Override public float getFloat(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getFloat(baseObject, getFieldOffset(ordinal)); + return Platform.getFloat(baseObject, getFieldOffset(ordinal)); } @Override public double getDouble(int ordinal) { assertIndexIsValid(ordinal); - return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(ordinal)); + return Platform.getDouble(baseObject, 
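The new large-decimal path in setDecimal reserves a fixed 16-byte region in the variable-length section and keeps its cursor in the high 32 bits of the field's 8-byte slot, even when the value is set to null, so the column can still be updated later. A small sketch of that (cursor << 32) | size packing, with illustrative numbers:

def pack(cursor, num_bytes):
    # high 32 bits: offset of the data region; low 32 bits: its byte length
    return (cursor << 32) | num_bytes

def unpack(slot):
    return slot >> 32, slot & ((1 << 32) - 1)

slot = pack(cursor=128, num_bytes=9)        # e.g. a 9-byte unscaled BigInteger
assert unpack(slot) == (128, 9)

# Setting the value to null zeroes the 16 reserved bytes but keeps the cursor,
# so a later non-null update still knows where its fixed region lives.
null_slot = pack(cursor=128, num_bytes=0)
assert unpack(null_slot) == (128, 0)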
getFieldOffset(ordinal)); } @Override @@ -359,7 +385,7 @@ public Decimal getDecimal(int ordinal, int precision, int scale) { byte[] bytes = getBinary(ordinal); BigInteger bigInteger = new BigInteger(bytes); BigDecimal javaDecimal = new BigDecimal(bigInteger, scale); - return Decimal.apply(new scala.math.BigDecimal(javaDecimal), precision, scale); + return Decimal.apply(javaDecimal, precision, scale); } } @@ -381,11 +407,11 @@ public byte[] getBinary(int ordinal) { final int offset = (int) (offsetAndSize >> 32); final int size = (int) (offsetAndSize & ((1L << 32) - 1)); final byte[] bytes = new byte[size]; - PlatformDependent.copyMemory( + Platform.copyMemory( baseObject, baseOffset + offset, bytes, - PlatformDependent.BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, size ); return bytes; @@ -399,9 +425,8 @@ public CalendarInterval getInterval(int ordinal) { } else { final long offsetAndSize = getLong(ordinal); final int offset = (int) (offsetAndSize >> 32); - final int months = (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset); - final long microseconds = - PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset + 8); + final int months = (int) Platform.getLong(baseObject, baseOffset + offset); + final long microseconds = Platform.getLong(baseObject, baseOffset + offset + 8); return new CalendarInterval(months, microseconds); } } @@ -452,14 +477,14 @@ public MapData getMap(int ordinal) { public UnsafeRow copy() { UnsafeRow rowCopy = new UnsafeRow(); final byte[] rowDataCopy = new byte[sizeInBytes]; - PlatformDependent.copyMemory( + Platform.copyMemory( baseObject, baseOffset, rowDataCopy, - PlatformDependent.BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, sizeInBytes ); - rowCopy.pointTo(rowDataCopy, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes); + rowCopy.pointTo(rowDataCopy, Platform.BYTE_ARRAY_OFFSET, numFields, sizeInBytes); return rowCopy; } @@ -479,18 +504,13 @@ public static UnsafeRow createFromByteArray(int numBytes, int numFields) { */ public void copyFrom(UnsafeRow row) { // copyFrom is only available for UnsafeRow created from byte array. - assert (baseObject instanceof byte[]) && baseOffset == PlatformDependent.BYTE_ARRAY_OFFSET; + assert (baseObject instanceof byte[]) && baseOffset == Platform.BYTE_ARRAY_OFFSET; if (row.sizeInBytes > this.sizeInBytes) { // resize the underlying byte[] if it's not large enough. this.baseObject = new byte[row.sizeInBytes]; } - PlatformDependent.copyMemory( - row.baseObject, - row.baseOffset, - this.baseObject, - this.baseOffset, - row.sizeInBytes - ); + Platform.copyMemory( + row.baseObject, row.baseOffset, this.baseObject, this.baseOffset, row.sizeInBytes); // update the sizeInBytes. 
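On the read side, getDecimal either reinterprets the 8-byte slot as an unscaled long (precision <= 18) or rebuilds the value from the stored unscaled bytes. A rough Python analogue (not Spark code), assuming big-endian two's-complement bytes as produced by BigInteger.toByteArray():

from decimal import Decimal

def decimal_from_unscaled_long(unscaled, scale):
    # compact path: the unscaled value fits in a signed 64-bit long
    return Decimal(unscaled).scaleb(-scale)

def decimal_from_unscaled_bytes(raw, scale):
    # general path: big-endian two's-complement bytes, then apply the scale
    unscaled = int.from_bytes(raw, byteorder="big", signed=True)
    return Decimal(unscaled).scaleb(-scale)

assert decimal_from_unscaled_long(12345, 2) == Decimal("123.45")
raw = (12345).to_bytes(8, "big", signed=True)
assert decimal_from_unscaled_bytes(raw, 2) == Decimal("123.45")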
this.sizeInBytes = row.sizeInBytes; } @@ -505,19 +525,15 @@ public void copyFrom(UnsafeRow row) { */ public void writeToStream(OutputStream out, byte[] writeBuffer) throws IOException { if (baseObject instanceof byte[]) { - int offsetInByteArray = (int) (PlatformDependent.BYTE_ARRAY_OFFSET - baseOffset); + int offsetInByteArray = (int) (Platform.BYTE_ARRAY_OFFSET - baseOffset); out.write((byte[]) baseObject, offsetInByteArray, sizeInBytes); } else { int dataRemaining = sizeInBytes; long rowReadPosition = baseOffset; while (dataRemaining > 0) { int toTransfer = Math.min(writeBuffer.length, dataRemaining); - PlatformDependent.copyMemory( - baseObject, - rowReadPosition, - writeBuffer, - PlatformDependent.BYTE_ARRAY_OFFSET, - toTransfer); + Platform.copyMemory( + baseObject, rowReadPosition, writeBuffer, Platform.BYTE_ARRAY_OFFSET, toTransfer); out.write(writeBuffer, 0, toTransfer); rowReadPosition += toTransfer; dataRemaining -= toTransfer; @@ -545,13 +561,12 @@ public boolean equals(Object other) { * Returns the underlying bytes for this UnsafeRow. */ public byte[] getBytes() { - if (baseObject instanceof byte[] && baseOffset == PlatformDependent.BYTE_ARRAY_OFFSET + if (baseObject instanceof byte[] && baseOffset == Platform.BYTE_ARRAY_OFFSET && (((byte[]) baseObject).length == sizeInBytes)) { return (byte[]) baseObject; } else { byte[] bytes = new byte[sizeInBytes]; - PlatformDependent.copyMemory(baseObject, baseOffset, bytes, - PlatformDependent.BYTE_ARRAY_OFFSET, sizeInBytes); + Platform.copyMemory(baseObject, baseOffset, bytes, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); return bytes; } } @@ -561,8 +576,7 @@ public byte[] getBytes() { public String toString() { StringBuilder build = new StringBuilder("["); for (int i = 0; i < sizeInBytes; i += 8) { - build.append(java.lang.Long.toHexString( - PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + i))); + build.append(java.lang.Long.toHexString(Platform.getLong(baseObject, baseOffset + i))); build.append(','); } build.append(']'); @@ -580,12 +594,6 @@ public boolean anyNull() { * bytes in this string. */ public void writeToMemory(Object target, long targetOffset) { - PlatformDependent.copyMemory( - baseObject, - baseOffset, - target, - targetOffset, - sizeInBytes - ); + Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java index 31928731545d..2f43db68a750 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java @@ -17,10 +17,11 @@ package org.apache.spark.sql.catalyst.expressions; +import java.math.BigInteger; + import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.types.MapData; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.ByteArray; import org.apache.spark.unsafe.types.CalendarInterval; @@ -47,29 +48,38 @@ public static int write(UnsafeRow target, int ordinal, int cursor, Decimal input /** Writer for Decimal with precision larger than 18. 
*/ public static class DecimalWriter { - + private static final int SIZE = 16; public static int getSize(Decimal input) { // bounded size - return 16; + return SIZE; } public static int write(UnsafeRow target, int ordinal, int cursor, Decimal input) { + final Object base = target.getBaseObject(); final long offset = target.getBaseOffset() + cursor; - final byte[] bytes = input.toJavaBigDecimal().unscaledValue().toByteArray(); - final int numBytes = bytes.length; - assert(numBytes <= 16); - // zero-out the bytes - PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset, 0L); - PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset + 8, 0L); + Platform.putLong(base, offset, 0L); + Platform.putLong(base, offset + 8, 0L); + + if (input == null) { + target.setNullAt(ordinal); + // keep the offset and length for update + int fieldOffset = UnsafeRow.calculateBitSetWidthInBytes(target.numFields()) + ordinal * 8; + Platform.putLong(base, target.getBaseOffset() + fieldOffset, + ((long) cursor) << 32); + return SIZE; + } - // Write the bytes to the variable length portion. - PlatformDependent.copyMemory(bytes, PlatformDependent.BYTE_ARRAY_OFFSET, - target.getBaseObject(), offset, numBytes); + final BigInteger integer = input.toJavaBigDecimal().unscaledValue(); + byte[] bytes = integer.toByteArray(); + // Write the bytes to the variable length portion. + Platform.copyMemory( + bytes, Platform.BYTE_ARRAY_OFFSET, base, target.getBaseOffset() + cursor, bytes.length); // Set the fixed length portion. - target.setLong(ordinal, (((long) cursor) << 32) | ((long) numBytes)); - return 16; + target.setLong(ordinal, (((long) cursor) << 32) | (long) bytes.length); + + return SIZE; } } @@ -86,8 +96,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, UTF8String in // zero-out the padding bytes if ((numBytes & 0x07) > 0) { - PlatformDependent.UNSAFE.putLong( - target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); + Platform.putLong(target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); } // Write the bytes to the variable length portion. @@ -112,8 +121,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, byte[] input) // zero-out the padding bytes if ((numBytes & 0x07) > 0) { - PlatformDependent.UNSAFE.putLong( - target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); + Platform.putLong(target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); } // Write the bytes to the variable length portion. @@ -154,8 +162,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, InternalRow i // zero-out the padding bytes if ((numBytes & 0x07) > 0) { - PlatformDependent.UNSAFE.putLong( - target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); + Platform.putLong(target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); } // Write the bytes to the variable length portion. @@ -178,8 +185,8 @@ public static int write(UnsafeRow target, int ordinal, int cursor, CalendarInter final long offset = target.getBaseOffset() + cursor; // Write the months and microseconds fields of Interval to the variable length portion. - PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset, input.months); - PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset + 8, input.microseconds); + Platform.putLong(target.getBaseObject(), offset, input.months); + Platform.putLong(target.getBaseObject(), offset + 8, input.microseconds); // Set the fixed length portion. 
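Several writers above zero the last partial 8-byte word (offset + ((numBytes >> 3) << 3)) before copying the payload, so rows can later be compared and hashed over their raw bytes without stale memory leaking into the padding. A sketch of that rule with a hypothetical helper:

def write_padded(buf, offset, payload):
    num_bytes = len(payload)
    if num_bytes & 0x07:                               # payload ends mid-word
        word_start = offset + ((num_bytes >> 3) << 3)
        buf[word_start:word_start + 8] = b"\x00" * 8   # zero the whole last word
    buf[offset:offset + num_bytes] = payload
    return (num_bytes + 7) & ~7                        # 8-byte-rounded size consumed

buf = bytearray(b"\xff" * 32)                          # deliberately dirty memory
used = write_padded(buf, 0, b"hello")                  # 5 bytes -> one 8-byte word
assert used == 8
assert buf[0:5] == b"hello" and buf[5:8] == b"\x00\x00\x00"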
target.setLong(ordinal, ((long) cursor) << 32); @@ -199,12 +206,11 @@ public static int write(UnsafeRow target, int ordinal, int cursor, UnsafeArrayDa final long offset = target.getBaseOffset() + cursor; // write the number of elements into first 4 bytes. - PlatformDependent.UNSAFE.putInt(target.getBaseObject(), offset, input.numElements()); + Platform.putInt(target.getBaseObject(), offset, input.numElements()); // zero-out the padding bytes if ((numBytes & 0x07) > 0) { - PlatformDependent.UNSAFE.putLong( - target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); + Platform.putLong(target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); } // Write the bytes to the variable length portion. @@ -234,14 +240,13 @@ public static int write(UnsafeRow target, int ordinal, int cursor, UnsafeMapData final int numBytes = 4 + 4 + keysNumBytes + valuesNumBytes; // write the number of elements into first 4 bytes. - PlatformDependent.UNSAFE.putInt(target.getBaseObject(), offset, input.numElements()); + Platform.putInt(target.getBaseObject(), offset, input.numElements()); // write the numBytes of key array into second 4 bytes. - PlatformDependent.UNSAFE.putInt(target.getBaseObject(), offset + 4, keysNumBytes); + Platform.putInt(target.getBaseObject(), offset + 4, keysNumBytes); // zero-out the padding bytes if ((numBytes & 0x07) > 0) { - PlatformDependent.UNSAFE.putLong( - target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); + Platform.putLong(target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L); } // Write the bytes of key array to the variable length portion. diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeWriters.java index 0e8e405d055d..cd83695fca03 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeWriters.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeWriters.java @@ -18,8 +18,7 @@ package org.apache.spark.sql.catalyst.expressions; import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.PlatformDependent; -import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -36,17 +35,11 @@ public static void writeToMemory( // zero-out the padding bytes // if ((numBytes & 0x07) > 0) { -// PlatformDependent.UNSAFE.putLong(targetObject, targetOffset + ((numBytes >> 3) << 3), 0L); +// Platform.putLong(targetObject, targetOffset + ((numBytes >> 3) << 3), 0L); // } // Write the UnsafeData to the target memory. - PlatformDependent.copyMemory( - inputObject, - inputOffset, - targetObject, - targetOffset, - numBytes - ); + Platform.copyMemory(inputObject, inputOffset, targetObject, targetOffset, numBytes); } public static int getRoundedSize(int size) { @@ -68,16 +61,11 @@ public static int write(Object targetObject, long targetOffset, Decimal input) { assert(numBytes <= 16); // zero-out the bytes - PlatformDependent.UNSAFE.putLong(targetObject, targetOffset, 0L); - PlatformDependent.UNSAFE.putLong(targetObject, targetOffset + 8, 0L); + Platform.putLong(targetObject, targetOffset, 0L); + Platform.putLong(targetObject, targetOffset + 8, 0L); // Write the bytes to the variable length portion. 
- PlatformDependent.copyMemory(bytes, - PlatformDependent.BYTE_ARRAY_OFFSET, - targetObject, - targetOffset, - numBytes); - + Platform.copyMemory(bytes, Platform.BYTE_ARRAY_OFFSET, targetObject, targetOffset, numBytes); return 16; } } @@ -111,8 +99,7 @@ public static int write(Object targetObject, long targetOffset, byte[] input) { final int numBytes = input.length; // Write the bytes to the variable length portion. - writeToMemory(input, PlatformDependent.BYTE_ARRAY_OFFSET, - targetObject, targetOffset, numBytes); + writeToMemory(input, Platform.BYTE_ARRAY_OFFSET, targetObject, targetOffset, numBytes); return getRoundedSize(numBytes); } @@ -144,11 +131,9 @@ public static int getSize(UnsafeRow input) { } public static int write(Object targetObject, long targetOffset, CalendarInterval input) { - // Write the months and microseconds fields of Interval to the variable length portion. - PlatformDependent.UNSAFE.putLong(targetObject, targetOffset, input.months); - PlatformDependent.UNSAFE.putLong(targetObject, targetOffset + 8, input.microseconds); - + Platform.putLong(targetObject, targetOffset, input.months); + Platform.putLong(targetObject, targetOffset + 8, input.microseconds); return 16; } } @@ -165,11 +150,11 @@ public static int write(Object targetObject, long targetOffset, UnsafeArrayData final int numBytes = input.getSizeInBytes(); // write the number of elements into first 4 bytes. - PlatformDependent.UNSAFE.putInt(targetObject, targetOffset, input.numElements()); + Platform.putInt(targetObject, targetOffset, input.numElements()); // Write the bytes to the variable length portion. - writeToMemory(input.getBaseObject(), input.getBaseOffset(), - targetObject, targetOffset + 4, numBytes); + writeToMemory( + input.getBaseObject(), input.getBaseOffset(), targetObject, targetOffset + 4, numBytes); return getRoundedSize(numBytes + 4); } @@ -190,9 +175,9 @@ public static int write(Object targetObject, long targetOffset, UnsafeMapData in final int numBytes = 4 + 4 + keysNumBytes + valuesNumBytes; // write the number of elements into first 4 bytes. - PlatformDependent.UNSAFE.putInt(targetObject, targetOffset, input.numElements()); + Platform.putInt(targetObject, targetOffset, input.numElements()); // write the numBytes of key array into second 4 bytes. - PlatformDependent.UNSAFE.putInt(targetObject, targetOffset + 4, keysNumBytes); + Platform.putInt(targetObject, targetOffset + 4, keysNumBytes); // Write the bytes of key array to the variable length portion. 
writeToMemory(keyArray.getBaseObject(), keyArray.getBaseOffset(), diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java index 5e4c6232c947..1d27182912c8 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.util.collection.unsafe.sort.PrefixComparator; import org.apache.spark.util.collection.unsafe.sort.RecordComparator; import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter; @@ -106,8 +106,15 @@ void spill() throws IOException { sorter.spill(); } + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsage() { + return sorter.getPeakMemoryUsedBytes(); + } + private void cleanupResources() { - sorter.freeMemory(); + sorter.cleanupResources(); } @VisibleForTesting @@ -150,7 +157,7 @@ public UnsafeRow next() { cleanupResources(); // Scala iterators don't declare any checked exceptions, so we need to use this hack // to re-throw the exception: - PlatformDependent.throwException(e); + Platform.throwException(e); } throw new RuntimeException("Exception should have been re-thrown in next()"); }; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 91449479fa53..088b7e121108 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.InternalRow +import scala.util.hashing.MurmurHash3 + import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.StructType @@ -364,31 +365,10 @@ trait Row extends Serializable { false } - /** - * Returns true if we can check equality for these 2 rows. - * Equality check between external row and internal row is not allowed. - * Here we do this check to prevent call `equals` on external row with internal row. - */ - protected def canEqual(other: Row) = { - // Note that `Row` is not only the interface of external row but also the parent - // of `InternalRow`, so we have to ensure `other` is not a internal row here to prevent - // call `equals` on external row with internal row. - // `InternalRow` overrides canEqual, and these two canEquals together makes sure that - // equality check between external Row and InternalRow will always fail. - // In the future, InternalRow should not extend Row. In that case, we can remove these - // canEqual methods. - !other.isInstanceOf[InternalRow] - } - override def equals(o: Any): Boolean = { if (!o.isInstanceOf[Row]) return false val other = o.asInstanceOf[Row] - if (!canEqual(other)) { - throw new UnsupportedOperationException( - "cannot check equality between external and internal rows") - } - if (other eq null) return false if (length != other.length) { @@ -417,6 +397,10 @@ trait Row extends Serializable { if (!o2.isInstanceOf[Double] || ! 
java.lang.Double.isNaN(o2.asInstanceOf[Double])) { return false } + case d1: java.math.BigDecimal if o2.isInstanceOf[java.math.BigDecimal] => + if (d1.compareTo(o2.asInstanceOf[java.math.BigDecimal]) != 0) { + return false + } case _ => if (o1 != o2) { return false } @@ -427,6 +411,18 @@ trait Row extends Serializable { true } + override def hashCode: Int = { + // Using Scala's Seq hash code implementation. + var n = 0 + var h = MurmurHash3.seqSeed + val len = length + while (n < len) { + h = MurmurHash3.mix(h, apply(n).##) + n += 1 + } + MurmurHash3.finalizeHash(h, n) + } + /* ---------------------- utility methods for Scala ---------------------- */ /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala index d494ae7b71d1..5898a5f93f38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala @@ -104,6 +104,8 @@ class SqlLexical extends StdLexical { override lazy val token: Parser[Token] = ( identChar ~ (identChar | digit).* ^^ { case first ~ rest => processIdent((first :: rest).mkString) } + | digit.* ~ identChar ~ (identChar | digit).* ^^ + { case first ~ middle ~ rest => processIdent((first ++ (middle :: rest)).mkString) } | rep1(digit) ~ ('.' ~> digit.*).? ^^ { case i ~ None => NumericLit(i.mkString) case i ~ Some(d) => FloatLit(i.mkString + "." + d.mkString) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index c666864e43ab..966623ed017b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -317,18 +317,26 @@ object CatalystTypeConverters { private class DecimalConverter(dataType: DecimalType) extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] { - override def toCatalystImpl(scalaValue: Any): Decimal = scalaValue match { - case d: BigDecimal => Decimal(d) - case d: JavaBigDecimal => Decimal(d) - case d: Decimal => d + override def toCatalystImpl(scalaValue: Any): Decimal = { + val decimal = scalaValue match { + case d: BigDecimal => Decimal(d) + case d: JavaBigDecimal => Decimal(d) + case d: Decimal => d + } + if (decimal.changePrecision(dataType.precision, dataType.scale)) { + decimal + } else { + null + } + } + override def toScala(catalystValue: Decimal): JavaBigDecimal = { + if (catalystValue == null) null + else catalystValue.toJavaBigDecimal } - override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal override def toScalaImpl(row: InternalRow, column: Int): JavaBigDecimal = row.getDecimal(column, dataType.precision, dataType.scale).toJavaBigDecimal } - private object BigDecimalConverter extends DecimalConverter(DecimalType.SYSTEM_DEFAULT) - private abstract class PrimitiveConverter[T] extends CatalystTypeConverter[T, Any, Any] { final override def toScala(catalystValue: Any): Any = catalystValue final override def toCatalystImpl(scalaValue: T): Any = scalaValue @@ -413,8 +421,8 @@ object CatalystTypeConverters { case s: String => StringConverter.toCatalyst(s) case d: Date => DateConverter.toCatalyst(d) case t: Timestamp => TimestampConverter.toCatalyst(t) - case d: BigDecimal => 
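The DecimalConverter change above coerces incoming values to the target precision and scale, returning null when they do not fit rather than failing later in execution. A rough analogue with Python's decimal module (the rounding mode and digit counting here are assumptions for illustration, not Spark's exact semantics):

from decimal import Decimal, ROUND_HALF_UP

def to_catalyst_decimal(value, precision, scale):
    target_exponent = Decimal(1).scaleb(-scale)           # e.g. scale=2 -> 0.01
    d = Decimal(str(value)).quantize(target_exponent, rounding=ROUND_HALF_UP)
    digits = len(d.as_tuple().digits)
    return None if digits > precision else d              # overflow -> null

assert to_catalyst_decimal("123.456", precision=6, scale=2) == Decimal("123.46")
assert to_catalyst_decimal("123456.7", precision=5, scale=2) is None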
BigDecimalConverter.toCatalyst(d) - case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d) + case d: BigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) + case d: JavaBigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala index 7656d054dc36..eba95c5c8b90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala @@ -17,157 +17,60 @@ package org.apache.spark.sql.catalyst -import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.{DataType, StructType} /** * An abstract class for row used internal in Spark SQL, which only contain the columns as * internal types. */ -// todo: make InternalRow just extends SpecializedGetters, remove generic getter -abstract class InternalRow extends GenericSpecializedGetters with Serializable { +abstract class InternalRow extends SpecializedGetters with Serializable { def numFields: Int // This is only use for test and will throw a null pointer exception if the position is null. def getString(ordinal: Int): String = getUTF8String(ordinal).toString - override def toString: String = mkString("[", ",", "]") - /** * Make a copy of the current [[InternalRow]] object. */ def copy(): InternalRow /** Returns true if there are any NULL values in this row. */ - def anyNull: Boolean = { - val len = numFields - var i = 0 - while (i < len) { - if (isNullAt(i)) { return true } - i += 1 - } - false - } - - override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[InternalRow]) { - return false - } - - val other = o.asInstanceOf[InternalRow] - if (other eq null) { - return false - } - - val len = numFields - if (len != other.numFields) { - return false - } - - var i = 0 - while (i < len) { - if (isNullAt(i) != other.isNullAt(i)) { - return false - } - if (!isNullAt(i)) { - val o1 = genericGet(i) - val o2 = other.genericGet(i) - o1 match { - case b1: Array[Byte] => - if (!o2.isInstanceOf[Array[Byte]] || - !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) { - return false - } - case f1: Float if java.lang.Float.isNaN(f1) => - if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) { - return false - } - case d1: Double if java.lang.Double.isNaN(d1) => - if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) { - return false - } - case _ => if (o1 != o2) { - return false - } - } - } - i += 1 - } - true - } - - // Custom hashCode function that matches the efficient code generated version. 
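The Row.scala hunk above gives the external Row a MurmurHash3-based hashCode (replacing the hand-rolled hashCode being removed from InternalRow here). A minimal standalone sketch of that mixing scheme in plain Scala; the agreement with Scala's own Seq hashing is the property the new code comment relies on, and rowStyleHash is a made-up name used only for illustration:

{{{
import scala.util.hashing.MurmurHash3

// Mix each column's `##` into the Seq seed and finalize with the column count,
// exactly as the new Row.hashCode does.
def rowStyleHash(values: Seq[Any]): Int = {
  var n = 0
  var h = MurmurHash3.seqSeed
  while (n < values.length) {
    h = MurmurHash3.mix(h, values(n).##)
    n += 1
  }
  MurmurHash3.finalizeHash(h, n)
}

// Sanity check: should agree with Scala's Seq hashing for the same values.
assert(rowStyleHash(List(1, "a", null)) == MurmurHash3.seqHash(List(1, "a", null)))
}}}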
- override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numFields - while (i < len) { - val update: Int = - if (isNullAt(i)) { - 0 - } else { - genericGet(i) match { - case b: Boolean => if (b) 0 else 1 - case b: Byte => b.toInt - case s: Short => s.toInt - case i: Int => i - case l: Long => (l ^ (l >>> 32)).toInt - case f: Float => java.lang.Float.floatToIntBits(f) - case d: Double => - val b = java.lang.Double.doubleToLongBits(d) - (b ^ (b >>> 32)).toInt - case a: Array[Byte] => java.util.Arrays.hashCode(a) - case other => other.hashCode() - } - } - result = 37 * result + update - i += 1 - } - result - } + def anyNull: Boolean /* ---------------------- utility methods for Scala ---------------------- */ /** * Return a Scala Seq representing the row. Elements are placed in the same order in the Seq. */ - // todo: remove this as it needs the generic getter - def toSeq: Seq[Any] = { - val n = numFields - val values = new Array[Any](n) + def toSeq(fieldTypes: Seq[DataType]): Seq[Any] = { + val len = numFields + assert(len == fieldTypes.length) + + val values = new Array[Any](len) var i = 0 - while (i < n) { - values.update(i, genericGet(i)) + while (i < len) { + values(i) = get(i, fieldTypes(i)) i += 1 } values } - /** Displays all elements of this sequence in a string (without a separator). */ - def mkString: String = toSeq.mkString - - /** Displays all elements of this sequence in a string using a separator string. */ - def mkString(sep: String): String = toSeq.mkString(sep) - - /** - * Displays all elements of this traversable or iterator in a string using - * start, end, and separator strings. - */ - def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end) + def toSeq(schema: StructType): Seq[Any] = toSeq(schema.map(_.dataType)) } object InternalRow { /** - * This method can be used to construct a [[Row]] with the given values. + * This method can be used to construct a [[InternalRow]] with the given values. */ def apply(values: Any*): InternalRow = new GenericInternalRow(values.toArray) /** - * This method can be used to construct a [[Row]] from a [[Seq]] of values. + * This method can be used to construct a [[InternalRow]] from a [[Seq]] of values. */ def fromSeq(values: Seq[Any]): InternalRow = new GenericInternalRow(values.toArray) - /** Returns an empty row. */ + /** Returns an empty [[InternalRow]]. 
*/ val empty = apply() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala index aebcdeb9d070..d701559bf2d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala @@ -25,7 +25,9 @@ private[sql] case class TableIdentifier(table: String, database: Option[String] def toSeq: Seq[String] = database.toSeq :+ table - override def toString: String = toSeq.map("`" + _ + "`").mkString(".") + override def toString: String = quotedString + + def quotedString: String = toSeq.map("`" + _ + "`").mkString(".") def unquotedString: String = toSeq.mkString(".") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f5daba1543da..1a5de15c61f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -78,12 +78,13 @@ class Analyzer( ResolveAliases :: ExtractWindowExpressions :: GlobalAggregates :: - UnresolvedHavingClauseAttributes :: - RemoveEvaluationFromSort :: + ResolveAggregateFunctions :: HiveTypeCoercion.typeCoercionRules ++ extendedResolutionRules : _*), Batch("Nondeterministic", Once, - PullOutNondeterministic) + PullOutNondeterministic), + Batch("Cleanup", fixedPoint, + CleanupAliases) ) /** @@ -91,7 +92,7 @@ class Analyzer( */ object CTESubstitution extends Rule[LogicalPlan] { // TODO allow subquery to define CTE - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case With(child, relations) => substituteCTE(child, relations) case other => other } @@ -117,7 +118,7 @@ class Analyzer( * Substitute child plan with WindowSpecDefinitions. */ object WindowsSubstitution extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { // Lookup WindowSpecDefinitions. This rule works with unresolved children. case WithWindowDefinition(windowDefinitions, child) => child.transform { @@ -141,14 +142,12 @@ class Analyzer( object ResolveAliases extends Rule[LogicalPlan] { private def assignAliases(exprs: Seq[NamedExpression]) = { // The `UnresolvedAlias`s will appear only at root of a expression tree, we don't need - // to transform down the whole tree. + // to traverse the whole tree. 
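The TableIdentifier hunk above splits the old toString into quotedString and unquotedString. A standalone mirror of the patched formatting, using a made-up TableIdent stand-in since the real class is private[sql]:

{{{
// Hypothetical stand-in that mirrors the patched TableIdentifier.
case class TableIdent(table: String, database: Option[String] = None) {
  def toSeq: Seq[String] = database.toSeq :+ table
  def quotedString: String = toSeq.map("`" + _ + "`").mkString(".")
  def unquotedString: String = toSeq.mkString(".")
  override def toString: String = quotedString
}

assert(TableIdent("events", Some("logs")).quotedString == "`logs`.`events`")
assert(TableIdent("events", Some("logs")).unquotedString == "logs.events")
assert(TableIdent("events").toString == "`events`")
}}}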
exprs.zipWithIndex.map { case (u @ UnresolvedAlias(child), i) => child match { case _: UnresolvedAttribute => u case ne: NamedExpression => ne - case g: GetStructField => Alias(g, g.field.name)() - case g: GetArrayStructFields => Alias(g, g.field.name)() case g: Generator if g.resolved && g.elementTypes.size > 1 => MultiAlias(g, Nil) case e if !e.resolved => u case other => Alias(other, s"_c$i")() @@ -157,7 +156,7 @@ class Analyzer( } } - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case Aggregate(groups, aggs, child) if child.resolved && aggs.exists(_.isInstanceOf[UnresolvedAlias]) => Aggregate(groups, assignAliases(aggs), child) @@ -199,7 +198,7 @@ class Analyzer( Seq.tabulate(1 << c.groupByExprs.length)(i => i) } - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case a if !a.childrenResolved => a // be sure all of the children are resolved. case a: Cube => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations) @@ -262,7 +261,7 @@ class Analyzer( } } - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case i @ InsertIntoTable(u: UnresolvedRelation, _, _, _, _) => i.copy(table = EliminateSubQueries(getTable(u))) case u: UnresolvedRelation => @@ -275,7 +274,7 @@ class Analyzer( * a logical plan node's children. */ object ResolveReferences extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case p: LogicalPlan if !p.childrenResolved => p // If the projection list contains Stars, expand it. @@ -385,9 +384,7 @@ class Analyzer( case u @ UnresolvedAttribute(nameParts) => // Leave unchanged if resolution fails. Hopefully will be resolved next round. val result = - withPosition(u) { - q.resolveChildren(nameParts, resolver).map(trimUnresolvedAlias).getOrElse(u) - } + withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } logDebug(s"Resolving $u to $result") result case UnresolvedExtractValue(child, fieldExpr) if child.resolved => @@ -409,15 +406,10 @@ class Analyzer( /** * Returns true if `exprs` contains a [[Star]]. */ - protected def containsStar(exprs: Seq[Expression]): Boolean = + def containsStar(exprs: Seq[Expression]): Boolean = exprs.exists(_.collect { case _: Star => true }.nonEmpty) } - private def trimUnresolvedAlias(ne: NamedExpression) = ne match { - case UnresolvedAlias(child) => child - case other => other - } - private def resolveSortOrders(ordering: Seq[SortOrder], plan: LogicalPlan, throws: Boolean) = { ordering.map { order => // Resolve SortOrder in one round. @@ -427,7 +419,7 @@ class Analyzer( try { val newOrder = order transformUp { case u @ UnresolvedAttribute(nameParts) => - plan.resolve(nameParts, resolver).map(trimUnresolvedAlias).getOrElse(u) + plan.resolve(nameParts, resolver).getOrElse(u) case UnresolvedExtractValue(child, fieldName) if child.resolved => ExtractValue(child, fieldName, resolver) } @@ -445,7 +437,7 @@ class Analyzer( * remove these attributes after sorting. 
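In the ResolveAliases hunk above, an unaliased projection now falls through to Alias(other, s"_c$i")(), so its column name comes from its position in the select list. A hedged usage illustration, assuming an existing SQLContext named sqlContext and a registered temporary table t with an integer column key:

{{{
// key keeps its own name; the unaliased expression is the second item in the
// projection, so ResolveAliases names it "_c1".
sqlContext.sql("SELECT key, key + 1 FROM t").columns
// expected: Array("key", "_c1")
}}}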
*/ object ResolveSortReferences extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case s @ Sort(ordering, global, p @ Project(projectList, child)) if !s.resolved && p.resolved => val (newOrdering, missing) = resolveAndFindMissing(ordering, p, child) @@ -460,37 +452,6 @@ class Analyzer( logDebug(s"Failed to find $missing in ${p.output.mkString(", ")}") s // Nothing we can do here. Return original plan. } - case s @ Sort(ordering, global, a @ Aggregate(grouping, aggs, child)) - if !s.resolved && a.resolved => - // A small hack to create an object that will allow us to resolve any references that - // refer to named expressions that are present in the grouping expressions. - val groupingRelation = LocalRelation( - grouping.collect { case ne: NamedExpression => ne.toAttribute } - ) - - // Find sort attributes that are projected away so we can temporarily add them back in. - val (newOrdering, missingAttr) = resolveAndFindMissing(ordering, a, groupingRelation) - - // Find aggregate expressions and evaluate them early, since they can't be evaluated in a - // Sort. - val (withAggsRemoved, aliasedAggregateList) = newOrdering.map { - case aggOrdering if aggOrdering.collect { case a: AggregateExpression => a }.nonEmpty => - val aliased = Alias(aggOrdering.child, "_aggOrdering")() - (aggOrdering.copy(child = aliased.toAttribute), Some(aliased)) - - case other => (other, None) - }.unzip - - val missing = missingAttr ++ aliasedAggregateList.flatten - - if (missing.nonEmpty) { - // Add missing grouping exprs and then project them away after the sort. - Project(a.output, - Sort(withAggsRemoved, global, - Aggregate(grouping, aggs ++ missing, child))) - } else { - s // Nothing we can do here. Return original plan. - } } /** @@ -520,9 +481,10 @@ class Analyzer( * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s. */ object ResolveFunctions extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case q: LogicalPlan => q transformExpressions { + case u if !u.childrenResolved => u // Skip until children are resolved. case u @ UnresolvedFunction(name, children, isDistinct) => withPosition(u) { registry.lookupFunction(name, children) match { @@ -552,7 +514,7 @@ class Analyzer( * Turns projections that contain aggregate expressions into aggregations. */ object GlobalAggregates extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case Project(projectList, child) if containsAggregates(projectList) => Aggregate(Nil, projectList, child) } @@ -567,21 +529,79 @@ class Analyzer( } /** - * This rule finds expressions in HAVING clause filters that depend on - * unresolved attributes. It pushes these expressions down to the underlying - * aggregates and then projects them away above the filter. + * This rule finds aggregate expressions that are not in an aggregate operator. For example, + * those in a HAVING clause or ORDER BY clause. These expressions are pushed down to the + * underlying aggregate operator and then projected away after the original operator. 
*/ - object UnresolvedHavingClauseAttributes extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { - case filter @ Filter(havingCondition, aggregate @ Aggregate(_, originalAggExprs, _)) - if aggregate.resolved && containsAggregate(havingCondition) => - - val evaluatedCondition = Alias(havingCondition, "havingCondition")() - val aggExprsWithHaving = evaluatedCondition +: originalAggExprs + object ResolveAggregateFunctions extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case filter @ Filter(havingCondition, + aggregate @ Aggregate(grouping, originalAggExprs, child)) + if aggregate.resolved && !filter.resolved => + + // Try resolving the condition of the filter as though it is in the aggregate clause + val aggregatedCondition = + Aggregate(grouping, Alias(havingCondition, "havingCondition")() :: Nil, child) + val resolvedOperator = execute(aggregatedCondition) + def resolvedAggregateFilter = + resolvedOperator + .asInstanceOf[Aggregate] + .aggregateExpressions.head + + // If resolution was successful and we see the filter has an aggregate in it, add it to + // the original aggregate operator. + if (resolvedOperator.resolved && containsAggregate(resolvedAggregateFilter)) { + val aggExprsWithHaving = resolvedAggregateFilter +: originalAggExprs + + Project(aggregate.output, + Filter(resolvedAggregateFilter.toAttribute, + aggregate.copy(aggregateExpressions = aggExprsWithHaving))) + } else { + filter + } - Project(aggregate.output, - Filter(evaluatedCondition.toAttribute, - aggregate.copy(aggregateExpressions = aggExprsWithHaving))) + case sort @ Sort(sortOrder, global, + aggregate @ Aggregate(grouping, originalAggExprs, child)) + if aggregate.resolved && !sort.resolved => + + // Try resolving the ordering as though it is in the aggregate clause. + try { + val aliasedOrder = sortOrder.map(o => Alias(o.child, "aggOrder")()) + val aggregatedOrdering = Aggregate(grouping, aliasedOrder, child) + val resolvedOperator: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + def resolvedAggregateOrdering = resolvedOperator.aggregateExpressions + + // Expressions that have an aggregate can be pushed down. + val needsAggregate = resolvedAggregateOrdering.exists(containsAggregate) + + // Attribute references, that are missing from the order but are present in the grouping + // expressions can also be pushed down. + val requiredAttributes = resolvedAggregateOrdering.map(_.references).reduce(_ ++ _) + val missingAttributes = requiredAttributes -- aggregate.outputSet + val validPushdownAttributes = + missingAttributes.filter(a => grouping.exists(a.semanticEquals)) + + // If resolution was successful and we see the ordering either has an aggregate in it or + // it is missing something that is projected away by the aggregate, add the ordering + // the original aggregate operator. + if (resolvedOperator.resolved && (needsAggregate || validPushdownAttributes.nonEmpty)) { + val evaluatedOrderings: Seq[SortOrder] = sortOrder.zip(resolvedAggregateOrdering).map { + case (order, evaluated) => order.copy(child = evaluated.toAttribute) + } + val aggExprsWithOrdering: Seq[NamedExpression] = + resolvedAggregateOrdering ++ originalAggExprs + + Project(aggregate.output, + Sort(evaluatedOrderings, global, + aggregate.copy(aggregateExpressions = aggExprsWithOrdering))) + } else { + sort + } + } catch { + // Attempting to resolve in the aggregate can result in ambiguity. When this happens, + // just return the original plan. 
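The new ResolveAggregateFunctions rule above resolves aggregate expressions that sit outside the Aggregate operator (HAVING conditions and ORDER BY clauses), evaluates them inside the aggregate, and projects them away again. A hedged illustration of the kind of query this enables, assuming an existing SQLContext named sqlContext and a registered temporary table t(key, value):

{{{
// Both the HAVING condition and the ORDER BY expression refer to aggregates that
// are not in the select list; the rule pushes them into the Aggregate and keeps
// only `key` in the final output via an outer Project.
sqlContext.sql(
  "SELECT key FROM t GROUP BY key HAVING count(*) > 1 ORDER BY max(value)").explain(true)
}}}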
+ case ae: AnalysisException => sort + } } protected def containsAggregate(condition: Expression): Boolean = { @@ -602,7 +622,9 @@ class Analyzer( * [[AnalysisException]] is throw. */ object ResolveGenerate extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case g: Generate if ResolveReferences.containsStar(g.generator.children) => + failAnalysis("Cannot explode *, explode can only be applied on a specific column.") case p: Generate if !p.child.resolved || !p.generator.resolved => p case g: Generate if !g.resolved => g.copy(generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name))) @@ -873,6 +895,7 @@ class Analyzer( // We have to use transformDown at here to make sure the rule of // "Aggregate with Having clause" will be triggered. def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { + // Aggregate with Having clause. This rule works with an unresolved Aggregate because // a resolved Aggregate will not have Window Functions. case f @ Filter(condition, a @ Aggregate(groupingExprs, aggregateExprs, child)) @@ -928,7 +951,8 @@ class Analyzer( * put them into an inner Project and finally project them away at the outer Project. */ object PullOutNondeterministic extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan transform { + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case p if !p.resolved => p // Skip unresolved nodes. case p: Project => p case f: Filter => f @@ -955,63 +979,6 @@ class Analyzer( Project(p.output, newPlan.withNewChildren(newChild :: Nil)) } } - - /** - * Removes all still-need-evaluate ordering expressions from sort and use an inner project to - * materialize them, finally use a outer project to project them away to keep the result same. - * Then we can make sure we only sort by [[AttributeReference]]s. - * - * As an example, - * {{{ - * Sort('a, 'b + 1, - * Relation('a, 'b)) - * }}} - * will be turned into: - * {{{ - * Project('a, 'b, - * Sort('a, '_sortCondition, - * Project('a, 'b, ('b + 1).as("_sortCondition"), - * Relation('a, 'b)))) - * }}} - */ - object RemoveEvaluationFromSort extends Rule[LogicalPlan] { - private def hasAlias(expr: Expression) = { - expr.find { - case a: Alias => true - case _ => false - }.isDefined - } - - override def apply(plan: LogicalPlan): LogicalPlan = plan transform { - // The ordering expressions have no effect to the output schema of `Sort`, - // so `Alias`s in ordering expressions are unnecessary and we should remove them. - case s @ Sort(ordering, _, _) if ordering.exists(hasAlias) => - val newOrdering = ordering.map(_.transformUp { - case Alias(child, _) => child - }.asInstanceOf[SortOrder]) - s.copy(order = newOrdering) - - case s @ Sort(ordering, global, child) - if s.expressions.forall(_.resolved) && s.childrenResolved && !s.hasNoEvaluation => - - val (ref, needEval) = ordering.partition(_.child.isInstanceOf[AttributeReference]) - - val namedExpr = needEval.map(_.child match { - case n: NamedExpression => n - case e => Alias(e, "_sortCondition")() - }) - - val newOrdering = ref ++ needEval.zip(namedExpr).map { case (order, ne) => - order.copy(child = ne.toAttribute) - } - - // Add still-need-evaluate ordering expressions into inner project and then project - // them away after the sort. 
- Project(child.output, - Sort(newOrdering, global, - Project(child.output ++ namedExpr, child))) - } - } } /** @@ -1023,3 +990,61 @@ object EliminateSubQueries extends Rule[LogicalPlan] { case Subquery(_, child) => child } } + +/** + * Cleans up unnecessary Aliases inside the plan. Basically we only need Alias as a top level + * expression in Project(project list) or Aggregate(aggregate expressions) or + * Window(window expressions). + */ +object CleanupAliases extends Rule[LogicalPlan] { + private def trimAliases(e: Expression): Expression = { + var stop = false + e.transformDown { + // CreateStruct is a special case, we need to retain its top level Aliases as they decide the + // name of StructField. We also need to stop transform down this expression, or the Aliases + // under CreateStruct will be mistakenly trimmed. + case c: CreateStruct if !stop => + stop = true + c.copy(children = c.children.map(trimNonTopLevelAliases)) + case c: CreateStructUnsafe if !stop => + stop = true + c.copy(children = c.children.map(trimNonTopLevelAliases)) + case Alias(child, _) if !stop => child + } + } + + def trimNonTopLevelAliases(e: Expression): Expression = e match { + case a: Alias => + Alias(trimAliases(a.child), a.name)(a.exprId, a.qualifiers, a.explicitMetadata) + case other => trimAliases(other) + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case Project(projectList, child) => + val cleanedProjectList = + projectList.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) + Project(cleanedProjectList, child) + + case Aggregate(grouping, aggs, child) => + val cleanedAggs = aggs.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) + Aggregate(grouping.map(trimAliases), cleanedAggs, child) + + case w @ Window(projectList, windowExprs, partitionSpec, orderSpec, child) => + val cleanedWindowExprs = + windowExprs.map(e => trimNonTopLevelAliases(e).asInstanceOf[NamedExpression]) + Window(projectList, cleanedWindowExprs, partitionSpec.map(trimAliases), + orderSpec.map(trimAliases(_).asInstanceOf[SortOrder]), child) + + case other => + var stop = false + other transformExpressionsDown { + case c: CreateStruct if !stop => + stop = true + c.copy(children = c.children.map(trimNonTopLevelAliases)) + case c: CreateStructUnsafe if !stop => + stop = true + c.copy(children = c.children.map(trimNonTopLevelAliases)) + case Alias(child, _) if !stop => child + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index 5766e6a2dd51..503c4f4b20f3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -23,6 +23,7 @@ import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{TableIdentifier, CatalystConf, EmptyConf} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} @@ -55,12 +56,15 @@ trait Catalog { def refreshTable(tableIdent: TableIdentifier): Unit + // TODO: Refactor it in the work of SPARK-10104 def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit + // TODO: Refactor it in the work of SPARK-10104 def unregisterTable(tableIdentifier: Seq[String]): Unit def unregisterAllTables(): Unit + // TODO: Refactor it in the work of 
SPARK-10104 protected def processTableIdentifier(tableIdentifier: Seq[String]): Seq[String] = { if (conf.caseSensitiveAnalysis) { tableIdentifier @@ -69,6 +73,7 @@ trait Catalog { } } + // TODO: Refactor it in the work of SPARK-10104 protected def getDbTableName(tableIdent: Seq[String]): String = { val size = tableIdent.size if (size <= 2) { @@ -78,9 +83,22 @@ trait Catalog { } } + // TODO: Refactor it in the work of SPARK-10104 protected def getDBTable(tableIdent: Seq[String]) : (Option[String], String) = { (tableIdent.lift(tableIdent.size - 2), tableIdent.last) } + + /** + * It is not allowed to specifiy database name for tables stored in [[SimpleCatalog]]. + * We use this method to check it. + */ + protected def checkTableIdentifier(tableIdentifier: Seq[String]): Unit = { + if (tableIdentifier.length > 1) { + throw new AnalysisException("Specifying database name or other qualifiers are not allowed " + + "for temporary tables. If the table name has dots (.) in it, please quote the " + + "table name with backticks (`).") + } + } } class SimpleCatalog(val conf: CatalystConf) extends Catalog { @@ -89,11 +107,13 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { override def registerTable( tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.put(getDbTableName(tableIdent), plan) } override def unregisterTable(tableIdentifier: Seq[String]): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.remove(getDbTableName(tableIdent)) } @@ -103,6 +123,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { } override def tableExists(tableIdentifier: Seq[String]): Boolean = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.containsKey(getDbTableName(tableIdent)) } @@ -110,6 +131,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { override def lookupRelation( tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) val tableFullName = getDbTableName(tableIdent) val table = tables.get(tableFullName) @@ -149,7 +171,13 @@ trait OverrideCatalog extends Catalog { abstract override def tableExists(tableIdentifier: Seq[String]): Boolean = { val tableIdent = processTableIdentifier(tableIdentifier) - overrides.get(getDBTable(tableIdent)) match { + // A temporary tables only has a single part in the tableIdentifier. + val overriddenTable = if (tableIdentifier.length > 1) { + None: Option[LogicalPlan] + } else { + overrides.get(getDBTable(tableIdent)) + } + overriddenTable match { case Some(_) => true case None => super.tableExists(tableIdentifier) } @@ -159,7 +187,12 @@ trait OverrideCatalog extends Catalog { tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { val tableIdent = processTableIdentifier(tableIdentifier) - val overriddenTable = overrides.get(getDBTable(tableIdent)) + // A temporary tables only has a single part in the tableIdentifier. 
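The checkTableIdentifier helper added to Catalog above rejects multi-part names for temporary tables. A hedged sketch of the observable behaviour; catalog and plan stand for an in-scope SimpleCatalog and some LogicalPlan and are not defined here:

{{{
catalog.registerTable(Seq("people"), plan)        // OK: single-part temporary table name
catalog.registerTable(Seq("db", "people"), plan)  // throws AnalysisException: database names
                                                  // and other qualifiers are not allowed for
                                                  // temporary tables; quote with backticks if
                                                  // the name itself contains dots.
}}}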
+ val overriddenTable = if (tableIdentifier.length > 1) { + None: Option[LogicalPlan] + } else { + overrides.get(getDBTable(tableIdent)) + } val tableWithQualifers = overriddenTable.map(r => Subquery(tableIdent.last, r)) // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are @@ -171,20 +204,8 @@ trait OverrideCatalog extends Catalog { } abstract override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { - val dbName = if (conf.caseSensitiveAnalysis) { - databaseName - } else { - if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None - } - - val temporaryTables = overrides.filter { - // If a temporary table does not have an associated database, we should return its name. - case ((None, _), _) => true - // If a temporary table does have an associated database, we should return it if the database - // matches the given database name. - case ((db: Some[String], _), _) if db == dbName => true - case _ => false - }.map { + // We always return all temporary tables. + val temporaryTables = overrides.map { case ((_, tableName), _) => (tableName, true) }.toSeq @@ -194,13 +215,19 @@ trait OverrideCatalog extends Catalog { override def registerTable( tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) overrides.put(getDBTable(tableIdent), plan) } override def unregisterTable(tableIdentifier: Seq[String]): Unit = { - val tableIdent = processTableIdentifier(tableIdentifier) - overrides.remove(getDBTable(tableIdent)) + // A temporary tables only has a single part in the tableIdentifier. + // If tableIdentifier has more than one parts, it is not a temporary table + // and we do not need to do anything at here. + if (tableIdentifier.length == 1) { + val tableIdent = processTableIdentifier(tableIdentifier) + overrides.remove(getDBTable(tableIdent)) + } } override def unregisterAllTables(): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 187b238045f8..7701fd045104 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -47,6 +47,7 @@ trait CheckAnalysis { // We transform up and order the rules so as to catch the first possible failure instead // of the result of cascading resolution failures. 
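The CheckAnalysis hunk that continues just below adds an explicit arity check for set operations instead of letting resolution fail later with a less helpful error. A hedged illustration, assuming an existing SQLContext named sqlContext and registered temporary tables t1(a) and t2(a, b):

{{{
// The two sides have different numbers of columns, so analysis now fails with a
// message naming the operator and both column counts.
sqlContext.sql("SELECT a FROM t1 UNION ALL SELECT a, b FROM t2")
// => AnalysisException: Union can only be performed on tables with the same
//    number of columns, but the left table has 1 columns and the right has 2
}}}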
plan.foreachUp { + case p if p.analyzed => // Skip already analyzed sub-plans case operator: LogicalPlan => operator transformExpressionsUp { @@ -136,6 +137,12 @@ trait CheckAnalysis { } } + case s @ SetOperation(left, right) if left.output.length != right.output.length => + failAnalysis( + s"${s.nodeName} can only be performed on tables with the same number of columns, " + + s"but the left table has ${left.output.length} columns and the right has " + + s"${right.output.length}") + case _ => // Fallbacks to the following checks } @@ -179,5 +186,7 @@ trait CheckAnalysis { } } extendedCheckRules.foreach(_(plan)) + + plan.foreach(_.setAnalyzed()) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index bc0846646174..cd5a90d78815 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -177,7 +177,9 @@ object FunctionRegistry { expression[ConcatWs]("concat_ws"), expression[Encode]("encode"), expression[Decode]("decode"), + expression[FindInSet]("find_in_set"), expression[FormatNumber]("format_number"), + expression[GetJsonObject]("get_json_object"), expression[InitCap]("initcap"), expression[Lower]("lcase"), expression[Lower]("lower"), @@ -201,6 +203,7 @@ object FunctionRegistry { expression[Substring]("substr"), expression[Substring]("substring"), expression[SubstringIndex]("substring_index"), + expression[StringTranslate]("translate"), expression[StringTrim]("trim"), expression[UnBase64]("unbase64"), expression[Upper]("ucase"), @@ -238,6 +241,7 @@ object FunctionRegistry { // collection functions expression[Size]("size"), expression[SortArray]("sort_array"), + expression[ArrayContains]("array_contains"), // misc functions expression[Crc32]("crc32"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 422d42374702..87c11abbad49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -144,7 +144,8 @@ object HiveTypeCoercion { * instances higher in the query tree. */ object PropagateTypes extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + // No propagation required for leaf nodes. case q: LogicalPlan if q.children.isEmpty => q @@ -163,7 +164,7 @@ object HiveTypeCoercion { // Leave the same if the dataTypes match. 
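Among the changes in the FunctionRegistry hunk above are four newly registered built-ins: find_in_set, get_json_object, translate and array_contains. A hedged usage sketch, assuming an existing SQLContext named sqlContext; the expected results follow the usual Hive-compatible semantics (2, "1", "spARk", true):

{{{
sqlContext.sql(
  "SELECT find_in_set('b', 'a,b,c'), get_json_object('{\"a\": 1}', '$.a'), " +
  "translate('spark', 'ar', 'AR'), array_contains(array(1, 2, 3), 2)").show()
}}}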
case Some(newType) if a.dataType == newType.dataType => a case Some(newType) => - logDebug(s"Promoting $a to $newType in ${q.simpleString}}") + logDebug(s"Promoting $a to $newType in ${q.simpleString}") newType } } @@ -202,6 +203,7 @@ object HiveTypeCoercion { planName: String, left: LogicalPlan, right: LogicalPlan): (LogicalPlan, LogicalPlan) = { + require(left.output.length == right.output.length) val castedTypes = left.output.zip(right.output).map { case (lhs, rhs) if lhs.dataType != rhs.dataType => @@ -225,16 +227,13 @@ object HiveTypeCoercion { } } - def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case u @ Union(left, right) if u.childrenResolved && !u.resolved => - val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right) - Union(newLeft, newRight) - case e @ Except(left, right) if e.childrenResolved && !e.resolved => - val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right) - Except(newLeft, newRight) - case i @ Intersect(left, right) if i.childrenResolved && !i.resolved => - val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right) - Intersect(newLeft, newRight) + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case p if p.analyzed => p + + case s @ SetOperation(left, right) if s.childrenResolved + && left.output.length == right.output.length && !s.resolved => + val (newLeft, newRight) = widenOutputTypes(s.nodeName, left, right) + s.makeCopy(Array(newLeft, newRight)) } } @@ -242,7 +241,7 @@ object HiveTypeCoercion { * Promotes strings that appear in arithmetic expressions. */ object PromoteStrings extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -305,7 +304,7 @@ object HiveTypeCoercion { * Convert all expressions in in() list to the left operator type */ object InConversion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. 
case e if !e.childrenResolved => e @@ -368,47 +367,60 @@ object HiveTypeCoercion { DecimalType.bounded(range + scale, scale) } - private def changePrecision(e: Expression, dataType: DataType): Expression = { - ChangeDecimalPrecision(Cast(e, dataType)) + private def promotePrecision(e: Expression, dataType: DataType): Expression = { + PromotePrecision(Cast(e, dataType)) } - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + // fix decimal precision for expressions case q => q.transformExpressions { // Skip nodes whose children have not been resolved yet case e if !e.childrenResolved => e // Skip nodes who is already promoted - case e: BinaryArithmetic if e.left.isInstanceOf[ChangeDecimalPrecision] => e + case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - Add(changePrecision(e1, dt), changePrecision(e2, dt)) + CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - Subtract(changePrecision(e1, dt), changePrecision(e2, dt)) + CheckOverflow(Subtract(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(p1 + p2 + 1, s1 + s2) - Multiply(changePrecision(e1, dt), changePrecision(e2, dt)) + val resultType = DecimalType.bounded(p1 + p2 + 1, s1 + s2) + val widerType = widerDecimalType(p1, s1, p2, s2) + CheckOverflow(Multiply(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), max(6, s1 + p2 + 1)) - Divide(changePrecision(e1, dt), changePrecision(e2, dt)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) + val widerType = widerDecimalType(p1, s1, p2, s2) + CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) - Cast(Remainder(changePrecision(e1, widerType), changePrecision(e2, widerType)), + CheckOverflow(Remainder(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) // resultType may have lower precision, so we cast them into wider type first. 
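The DecimalPrecision changes above wrap decimal arithmetic in PromotePrecision/CheckOverflow and cap the result type of Divide so precision and scale stay representable. A standalone sketch of just the division result-type arithmetic; the value 38 for DecimalType.MAX_SCALE is an assumption taken from the surrounding code base:

{{{
// Mirrors the intDig/decDig capping logic for Divide in the hunk above.
def divideResultType(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) = {
  val maxScale = 38  // assumed value of DecimalType.MAX_SCALE
  var intDig = math.min(maxScale, p1 - s1 + s2)
  var decDig = math.min(maxScale, math.max(6, s1 + p2 + 1))
  val diff = (intDig + decDig) - maxScale
  if (diff > 0) {
    decDig -= diff / 2 + 1
    intDig = maxScale - decDig
  }
  (intDig + decDig, decDig)  // (precision, scale), still subject to DecimalType.bounded
}

// Dividing two DECIMAL(38,18) values now yields DECIMAL(38,18) instead of an
// unrepresentable precision.
assert(divideResultType(38, 18, 38, 18) == (38, 18))
}}}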
val widerType = widerDecimalType(p1, s1, p2, s2) - Cast(Pmod(changePrecision(e1, widerType), changePrecision(e2, widerType)), resultType) + CheckOverflow(Pmod(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case b @ BinaryComparison(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => @@ -442,8 +454,8 @@ object HiveTypeCoercion { * Changes numeric values to booleans so that expressions like true = 1 can be evaluated. */ object BooleanEquality extends Rule[LogicalPlan] { - private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1)) - private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, Decimal(0)) + private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, Decimal.ONE) + private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, Decimal.ZERO) private def buildCaseKeyWhen(booleanExpr: Expression, numericExpr: Expression) = { CaseKeyWhen(numericExpr, Seq( @@ -466,7 +478,7 @@ object HiveTypeCoercion { )) } - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -508,7 +520,7 @@ object HiveTypeCoercion { * truncated version of this number. */ object StringToIntegralCasts extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -521,7 +533,7 @@ object HiveTypeCoercion { * This ensure that the types for various functions are as expected. */ object FunctionArgumentConversion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -575,7 +587,7 @@ object HiveTypeCoercion { * converted to fractional types. */ object Division extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who has not been resolved yet, // as this is an extra rule which should be applied at last. case e if !e.resolved => e @@ -592,10 +604,10 @@ object HiveTypeCoercion { * Coerces the type of different branches of a CASE WHEN statement to a common type. 
*/ object CaseWhenCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { case c: CaseWhenLike if c.childrenResolved && !c.valueTypesEqual => logDebug(s"Input values for null casting ${c.valueTypes.mkString(",")}") - val maybeCommonType = findTightestCommonTypeAndPromoteToString(c.valueTypes) + val maybeCommonType = findWiderCommonType(c.valueTypes) maybeCommonType.map { commonType => val castedBranches = c.branches.grouped(2).map { case Seq(when, value) if value.dataType != commonType => @@ -612,7 +624,7 @@ object HiveTypeCoercion { case c: CaseKeyWhen if c.childrenResolved && !c.resolved => val maybeCommonType = - findTightestCommonTypeAndPromoteToString((c.key +: c.whenList).map(_.dataType)) + findWiderCommonType((c.key +: c.whenList).map(_.dataType)) maybeCommonType.map { commonType => val castedBranches = c.branches.grouped(2).map { case Seq(whenExpr, thenExpr) if whenExpr.dataType != commonType => @@ -628,7 +640,8 @@ object HiveTypeCoercion { * Coerces the type of different branches of If statement to a common type. */ object IfCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + case e if !e.childrenResolved => e // Find tightest common type for If, if the true value and false value have different types. case i @ If(pred, left, right) if left.dataType != right.dataType => findTightestCommonTypeToString(left.dataType, right.dataType).map { widestType => @@ -652,7 +665,7 @@ object HiveTypeCoercion { private val acceptedTypes = Seq(DateType, TimestampType, StringType) - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -669,7 +682,7 @@ object HiveTypeCoercion { * Casts types according to the expected input types for [[Expression]]s. */ object ImplicitTypeCasts extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 03da45b09f92..43ee3191935e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.errors import org.apache.spark.sql.catalyst.expressions._ @@ -69,8 +70,64 @@ case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Un } object UnresolvedAttribute { + /** + * Creates an [[UnresolvedAttribute]], parsing segments separated by dots ('.'). + */ def apply(name: String): UnresolvedAttribute = new UnresolvedAttribute(name.split("\\.")) + + /** + * Creates an [[UnresolvedAttribute]], from a single quoted string (for example using backticks in + * HiveQL. 
Since the string is consider quoted, no processing is done on the name. + */ def quoted(name: String): UnresolvedAttribute = new UnresolvedAttribute(Seq(name)) + + /** + * Creates an [[UnresolvedAttribute]] from a string in an embedded language. In this case + * we treat it as a quoted identifier, except for '.', which must be further quoted using + * backticks if it is part of a column name. + */ + def quotedString(name: String): UnresolvedAttribute = + new UnresolvedAttribute(parseAttributeName(name)) + + /** + * Used to split attribute name by dot with backticks rule. + * Backticks must appear in pairs, and the quoted string must be a complete name part, + * which means `ab..c`e.f is not allowed. + * Escape character is not supported now, so we can't use backtick inside name part. + */ + def parseAttributeName(name: String): Seq[String] = { + def e = new AnalysisException(s"syntax error in attribute name: $name") + val nameParts = scala.collection.mutable.ArrayBuffer.empty[String] + val tmp = scala.collection.mutable.ArrayBuffer.empty[Char] + var inBacktick = false + var i = 0 + while (i < name.length) { + val char = name(i) + if (inBacktick) { + if (char == '`') { + inBacktick = false + if (i + 1 < name.length && name(i + 1) != '.') throw e + } else { + tmp += char + } + } else { + if (char == '`') { + if (tmp.nonEmpty) throw e + inBacktick = true + } else if (char == '.') { + if (name(i - 1) == '.' || i == name.length - 1) throw e + nameParts += tmp.mkString + tmp.clear() + } else { + tmp += char + } + } + i += 1 + } + if (inBacktick) throw e + nameParts += tmp.mkString + nameParts.toSeq + } } case class UnresolvedFunction( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 88429bb84b1e..2db954257be3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -26,8 +26,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -import scala.collection.mutable - object Cast { @@ -109,6 +107,8 @@ object Cast { case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with CodegenFallback { + override def toString: String = s"cast($child as ${dataType.simpleString})" + override def checkInputDataTypes(): TypeCheckResult = { if (Cast.canCast(child.dataType, dataType)) { TypeCheckResult.TypeCheckSuccess @@ -120,8 +120,6 @@ case class Cast(child: Expression, dataType: DataType) override def nullable: Boolean = Cast.forceNullable(child.dataType, dataType) || child.nullable - override def toString: String = s"CAST($child, $dataType)" - // [[func]] assumes the input is no longer null because eval already does the null check. 
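UnresolvedAttribute.parseAttributeName, added above, splits a dotted attribute name into parts while honouring backtick quoting. A small behavioural sketch that calls the catalyst helper directly (this assumes spark-catalyst on the classpath; the method itself is exactly the one in the hunk above):

{{{
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute.parseAttributeName

assert(parseAttributeName("a.b.c") == Seq("a", "b", "c"))   // plain dots split name parts
assert(parseAttributeName("`a.b`.c") == Seq("a.b", "c"))    // backticks protect the dot
// Backticks must wrap a complete name part and cannot be escaped, so inputs such as
// "`ab..c`e.f" throw AnalysisException("syntax error in attribute name: ...").
}}}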
@inline private[this] def buildCast[T](a: Any, func: T => Any): Any = func(a.asInstanceOf[T]) @@ -157,7 +155,7 @@ case class Cast(child: Expression, dataType: DataType) case ByteType => buildCast[Byte](_, _ != 0) case DecimalType() => - buildCast[Decimal](_, _ != Decimal(0)) + buildCast[Decimal](_, !_.isZero) case DoubleType => buildCast[Double](_, _ != 0) case FloatType => @@ -311,19 +309,19 @@ case class Cast(child: Expression, dataType: DataType) case _: NumberFormatException => null }) case BooleanType => - buildCast[Boolean](_, b => changePrecision(if (b) Decimal(1) else Decimal(0), target)) + buildCast[Boolean](_, b => changePrecision(if (b) Decimal.ONE else Decimal.ZERO, target)) case DateType => buildCast[Int](_, d => null) // date can't cast to decimal in Hive case TimestampType => // Note that we lose precision here. buildCast[Long](_, t => changePrecision(Decimal(timestampToDouble(t)), target)) - case DecimalType() => + case dt: DecimalType => b => changePrecision(b.asInstanceOf[Decimal].clone(), target) - case LongType => - b => changePrecision(Decimal(b.asInstanceOf[Long]), target) - case x: NumericType => // All other numeric types can be represented precisely as Doubles + case t: IntegralType => + b => changePrecision(Decimal(t.integral.asInstanceOf[Integral[Any]].toLong(b)), target) + case x: FractionalType => b => try { - changePrecision(Decimal(x.numeric.asInstanceOf[Numeric[Any]].toDouble(b)), target) + changePrecision(Decimal(x.fractional.asInstanceOf[Fractional[Any]].toDouble(b)), target) } catch { case _: NumberFormatException => null } @@ -449,7 +447,7 @@ case class Cast(child: Expression, dataType: DataType) case StringType => castToStringCode(from, ctx) case BinaryType => castToBinaryCode(from) case DateType => castToDateCode(from, ctx) - case decimal: DecimalType => castToDecimalCode(from, decimal) + case decimal: DecimalType => castToDecimalCode(from, decimal, ctx) case TimestampType => castToTimestampCode(from, ctx) case CalendarIntervalType => castToIntervalCode(from) case BooleanType => castToBooleanCode(from) @@ -530,17 +528,18 @@ case class Cast(child: Expression, dataType: DataType) } """ - private[this] def castToDecimalCode(from: DataType, target: DecimalType): CastFunction = { + private[this] def castToDecimalCode( + from: DataType, + target: DecimalType, + ctx: CodeGenContext): CastFunction = { + val tmp = ctx.freshName("tmpDecimal") from match { case StringType => (c, evPrim, evNull) => s""" try { - org.apache.spark.sql.types.Decimal tmpDecimal = - new org.apache.spark.sql.types.Decimal().set( - new scala.math.BigDecimal( - new java.math.BigDecimal($c.toString()))); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply(new java.math.BigDecimal($c.toString())); + ${changePrecision(tmp, target, evPrim, evNull)} } catch (java.lang.NumberFormatException e) { $evNull = true; } @@ -548,13 +547,8 @@ case class Cast(child: Expression, dataType: DataType) case BooleanType => (c, evPrim, evNull) => s""" - org.apache.spark.sql.types.Decimal tmpDecimal = null; - if ($c) { - tmpDecimal = new org.apache.spark.sql.types.Decimal().set(1); - } else { - tmpDecimal = new org.apache.spark.sql.types.Decimal().set(0); - } - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = $c ? 
Decimal.apply(1) : Decimal.apply(0); + ${changePrecision(tmp, target, evPrim, evNull)} """ case DateType => // date can't cast to decimal in Hive @@ -563,33 +557,29 @@ case class Cast(child: Expression, dataType: DataType) // Note that we lose precision here. (c, evPrim, evNull) => s""" - org.apache.spark.sql.types.Decimal tmpDecimal = - new org.apache.spark.sql.types.Decimal().set( - scala.math.BigDecimal.valueOf(${timestampToDoubleCode(c)})); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply( + scala.math.BigDecimal.valueOf(${timestampToDoubleCode(c)})); + ${changePrecision(tmp, target, evPrim, evNull)} """ case DecimalType() => (c, evPrim, evNull) => s""" - org.apache.spark.sql.types.Decimal tmpDecimal = $c.clone(); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = $c.clone(); + ${changePrecision(tmp, target, evPrim, evNull)} """ - case LongType => + case x: IntegralType => (c, evPrim, evNull) => s""" - org.apache.spark.sql.types.Decimal tmpDecimal = - new org.apache.spark.sql.types.Decimal().set($c); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply((long) $c); + ${changePrecision(tmp, target, evPrim, evNull)} """ - case x: NumericType => + case x: FractionalType => // All other numeric types can be represented precisely as Doubles (c, evPrim, evNull) => s""" try { - org.apache.spark.sql.types.Decimal tmpDecimal = - new org.apache.spark.sql.types.Decimal().set( - scala.math.BigDecimal.valueOf((double) $c)); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply(scala.math.BigDecimal.valueOf((double) $c)); + ${changePrecision(tmp, target, evPrim, evNull)} } catch (java.lang.NumberFormatException e) { $evNull = true; } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index ef2fc2e8c29d..0b98f555a1d6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -444,7 +444,7 @@ abstract class TernaryExpression extends Expression { override def nullable: Boolean = children.exists(_.nullable) /** - * Default behavior of evaluation according to the default nullability of BinaryExpression. + * Default behavior of evaluation according to the default nullability of TernaryExpression. * If subclass of BinaryExpression override nullable, probably should also override this. */ override def eval(input: InternalRow): Any = { @@ -463,7 +463,7 @@ abstract class TernaryExpression extends Expression { } /** - * Called by default [[eval]] implementation. If subclass of BinaryExpression keep the default + * Called by default [[eval]] implementation. If subclass of TernaryExpression keep the default * nullability, they can override this method to save null-check code. If we need full control * of evaluation process, we should override [[eval]]. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FromUnsafe.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FromUnsafe.scala deleted file mode 100644 index 3caf0fb3410c..000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FromUnsafe.scala +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions - -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.types._ - -case class FromUnsafe(child: Expression) extends UnaryExpression - with ExpectsInputTypes with CodegenFallback { - - override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(ArrayType, StructType, MapType)) - - override def dataType: DataType = child.dataType - - private def convert(value: Any, dt: DataType): Any = dt match { - case StructType(fields) => - val row = value.asInstanceOf[UnsafeRow] - val result = new Array[Any](fields.length) - fields.map(_.dataType).zipWithIndex.foreach { case (dt, i) => - if (!row.isNullAt(i)) { - result(i) = convert(row.get(i, dt), dt) - } - } - new GenericInternalRow(result) - - case ArrayType(elementType, _) => - val array = value.asInstanceOf[UnsafeArrayData] - val length = array.numElements() - val result = new Array[Any](length) - var i = 0 - while (i < length) { - if (!array.isNullAt(i)) { - result(i) = convert(array.get(i, elementType), elementType) - } - i += 1 - } - new GenericArrayData(result) - - case MapType(kt, vt, _) => - val map = value.asInstanceOf[UnsafeMapData] - val safeKeyArray = convert(map.keys, ArrayType(kt)).asInstanceOf[GenericArrayData] - val safeValueArray = convert(map.values, ArrayType(vt)).asInstanceOf[GenericArrayData] - new ArrayBasedMapData(safeKeyArray, safeValueArray) - - case _ => value - } - - override def nullSafeEval(input: Any): Any = { - convert(input, dataType) - } -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala deleted file mode 100644 index 6e957928e02a..000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{DataType, MapData, ArrayData, Decimal} -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} - -trait GenericSpecializedGetters extends SpecializedGetters { - - def genericGet(ordinal: Int): Any - - private def getAs[T](ordinal: Int) = genericGet(ordinal).asInstanceOf[T] - - override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null - - override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal) - - override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) - - override def getByte(ordinal: Int): Byte = getAs(ordinal) - - override def getShort(ordinal: Int): Short = getAs(ordinal) - - override def getInt(ordinal: Int): Int = getAs(ordinal) - - override def getLong(ordinal: Int): Long = getAs(ordinal) - - override def getFloat(ordinal: Int): Float = getAs(ordinal) - - override def getDouble(ordinal: Int): Double = getAs(ordinal) - - override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal) - - override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal) - - override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal) - - override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal) - - override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal) - - override def getArray(ordinal: Int): ArrayData = getAs(ordinal) - - override def getMap(ordinal: Int): MapData = getAs(ordinal) -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/JoinedRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/JoinedRow.scala new file mode 100644 index 000000000000..d3560df0792e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/JoinedRow.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} + + +/** + * A mutable wrapper that makes two rows appear as a single concatenated row. Designed to + * be instantiated once per thread and reused. 
+ */ +class JoinedRow extends InternalRow { + private[this] var row1: InternalRow = _ + private[this] var row2: InternalRow = _ + + def this(left: InternalRow, right: InternalRow) = { + this() + row1 = left + row2 = right + } + + /** Updates this JoinedRow to used point at two new base rows. Returns itself. */ + def apply(r1: InternalRow, r2: InternalRow): JoinedRow = { + row1 = r1 + row2 = r2 + this + } + + /** Updates this JoinedRow by updating its left base row. Returns itself. */ + def withLeft(newLeft: InternalRow): JoinedRow = { + row1 = newLeft + this + } + + /** Updates this JoinedRow by updating its right base row. Returns itself. */ + def withRight(newRight: InternalRow): JoinedRow = { + row2 = newRight + this + } + + override def toSeq(fieldTypes: Seq[DataType]): Seq[Any] = { + assert(fieldTypes.length == row1.numFields + row2.numFields) + val (left, right) = fieldTypes.splitAt(row1.numFields) + row1.toSeq(left) ++ row2.toSeq(right) + } + + override def numFields: Int = row1.numFields + row2.numFields + + override def get(i: Int, dt: DataType): AnyRef = + if (i < row1.numFields) row1.get(i, dt) else row2.get(i - row1.numFields, dt) + + override def isNullAt(i: Int): Boolean = + if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields) + + override def getBoolean(i: Int): Boolean = + if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields) + + override def getByte(i: Int): Byte = + if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields) + + override def getShort(i: Int): Short = + if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields) + + override def getInt(i: Int): Int = + if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields) + + override def getLong(i: Int): Long = + if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields) + + override def getFloat(i: Int): Float = + if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields) + + override def getDouble(i: Int): Double = + if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields) + + override def getDecimal(i: Int, precision: Int, scale: Int): Decimal = { + if (i < row1.numFields) { + row1.getDecimal(i, precision, scale) + } else { + row2.getDecimal(i - row1.numFields, precision, scale) + } + } + + override def getUTF8String(i: Int): UTF8String = + if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields) + + override def getBinary(i: Int): Array[Byte] = + if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields) + + override def getArray(i: Int): ArrayData = + if (i < row1.numFields) row1.getArray(i) else row2.getArray(i - row1.numFields) + + override def getInterval(i: Int): CalendarInterval = + if (i < row1.numFields) row1.getInterval(i) else row2.getInterval(i - row1.numFields) + + override def getMap(i: Int): MapData = + if (i < row1.numFields) row1.getMap(i) else row2.getMap(i - row1.numFields) + + override def getStruct(i: Int, numFields: Int): InternalRow = { + if (i < row1.numFields) { + row1.getStruct(i, numFields) + } else { + row2.getStruct(i - row1.numFields, numFields) + } + } + + override def anyNull: Boolean = row1.anyNull || row2.anyNull + + override def copy(): InternalRow = { + val copy1 = row1.copy() + val copy2 = row2.copy() + new JoinedRow(copy1, copy2) + } + + override def toString: String = { + // Make sure toString never throws NullPointerException. 
+ if ((row1 eq null) && (row2 eq null)) { + "[ empty row ]" + } else if (row1 eq null) { + row2.toString + } else if (row2 eq null) { + row1.toString + } else { + s"{${row1.toString} + ${row2.toString}}" + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index 79649741025a..afe52e6a667e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -152,13 +152,7 @@ object FromUnsafeProjection { */ def apply(fields: Seq[DataType]): Projection = { create(fields.zipWithIndex.map(x => { - val b = new BoundReference(x._2, x._1, true) - // todo: this is quite slow, maybe remove this whole projection after remove generic getter of - // InternalRow? - b.dataType match { - case _: StructType | _: ArrayType | _: MapType => FromUnsafe(b) - case _ => b - } + new BoundReference(x._2, x._1, true) })) } @@ -169,118 +163,3 @@ object FromUnsafeProjection { GenerateSafeProjection.generate(exprs) } } - -/** - * A mutable wrapper that makes two rows appear as a single concatenated row. Designed to - * be instantiated once per thread and reused. - */ -class JoinedRow extends InternalRow { - private[this] var row1: InternalRow = _ - private[this] var row2: InternalRow = _ - - def this(left: InternalRow, right: InternalRow) = { - this() - row1 = left - row2 = right - } - - /** Updates this JoinedRow to used point at two new base rows. Returns itself. */ - def apply(r1: InternalRow, r2: InternalRow): InternalRow = { - row1 = r1 - row2 = r2 - this - } - - /** Updates this JoinedRow by updating its left base row. Returns itself. */ - def withLeft(newLeft: InternalRow): InternalRow = { - row1 = newLeft - this - } - - /** Updates this JoinedRow by updating its right base row. Returns itself. 
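Aside: JoinedRow, now moved into its own file above, is meant to be instantiated once per thread and mutated in place. A hedged usage sketch (not part of the patch); the iterator names and the surrounding operator are hypothetical:

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions.JoinedRow

    // One JoinedRow per thread; apply/withLeft/withRight mutate it rather than
    // allocating a row per pair, so consumers must copy() any row they retain.
    val joined = new JoinedRow
    def emitPairs(left: Iterator[InternalRow], right: Seq[InternalRow]): Iterator[InternalRow] =
      left.flatMap(l => right.iterator.map(r => joined(l, r)))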
*/ - def withRight(newRight: InternalRow): InternalRow = { - row2 = newRight - this - } - - override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq - - override def numFields: Int = row1.numFields + row2.numFields - - override def genericGet(i: Int): Any = - if (i < row1.numFields) row1.genericGet(i) else row2.genericGet(i - row1.numFields) - - override def isNullAt(i: Int): Boolean = - if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields) - - override def getBoolean(i: Int): Boolean = - if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields) - - override def getByte(i: Int): Byte = - if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields) - - override def getShort(i: Int): Short = - if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields) - - override def getInt(i: Int): Int = - if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields) - - override def getLong(i: Int): Long = - if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields) - - override def getFloat(i: Int): Float = - if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields) - - override def getDouble(i: Int): Double = - if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields) - - override def getDecimal(i: Int, precision: Int, scale: Int): Decimal = { - if (i < row1.numFields) { - row1.getDecimal(i, precision, scale) - } else { - row2.getDecimal(i - row1.numFields, precision, scale) - } - } - - override def getUTF8String(i: Int): UTF8String = - if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields) - - override def getBinary(i: Int): Array[Byte] = - if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields) - - override def getArray(i: Int): ArrayData = - if (i < row1.numFields) row1.getArray(i) else row2.getArray(i - row1.numFields) - - override def getInterval(i: Int): CalendarInterval = - if (i < row1.numFields) row1.getInterval(i) else row2.getInterval(i - row1.numFields) - - override def getMap(i: Int): MapData = - if (i < row1.numFields) row1.getMap(i) else row2.getMap(i - row1.numFields) - - override def getStruct(i: Int, numFields: Int): InternalRow = { - if (i < row1.numFields) { - row1.getStruct(i, numFields) - } else { - row2.getStruct(i - row1.numFields, numFields) - } - } - - override def copy(): InternalRow = { - val copy1 = row1.copy() - val copy2 = row2.copy() - new JoinedRow(copy1, copy2) - } - - override def toString: String = { - // Make sure toString never throws NullPointerException. 
- if ((row1 eq null) && (row2 eq null)) { - "[ empty row ]" - } else if (row1 eq null) { - row2.mkString("[", ",", "]") - } else if (row2 eq null) { - row1.mkString("[", ",", "]") - } else { - mkString("[", ",", "]") - } - } -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala index f6a872ba446e..98e029035ab6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types._ +import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.BinaryPrefixComparator import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.DoublePrefixComparator abstract sealed class SortDirection @@ -63,6 +64,7 @@ case class SortPrefix(child: SortOrder) extends UnaryExpression { override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val childCode = child.child.gen(ctx) val input = childCode.primitive + val BinaryPrefixCmp = classOf[BinaryPrefixComparator].getName val DoublePrefixCmp = classOf[DoublePrefixComparator].getName val (nullValue: Long, prefixCode: String) = child.child.dataType match { @@ -76,6 +78,7 @@ case class SortPrefix(child: SortOrder) extends UnaryExpression { (DoublePrefixComparator.computePrefix(Double.NegativeInfinity), s"$DoublePrefixCmp.computePrefix((double)$input)") case StringType => (0L, s"$input.getPrefix()") + case BinaryType => (0L, s"$BinaryPrefixCmp.computePrefix($input)") case dt: DecimalType if dt.precision - dt.scale <= Decimal.MAX_LONG_DIGITS => val prefix = if (dt.precision <= Decimal.MAX_LONG_DIGITS) { s"$input.toUnscaledLong()" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala index d149a5b179b6..4f56f94bd4ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala @@ -192,7 +192,8 @@ final class MutableAny extends MutableValue { * based on the dataTypes of each column. The intent is to decrease garbage when modifying the * values of primitive columns. 
*/ -final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableRow { +final class SpecificMutableRow(val values: Array[MutableValue]) + extends MutableRow with BaseGenericInternalRow { def this(dataTypes: Seq[DataType]) = this( @@ -213,8 +214,6 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR override def numFields: Int = values.length - override def toSeq: Seq[Any] = values.map(_.boxed) - override def setNullAt(i: Int): Unit = { values(i).isNull = true } @@ -232,7 +231,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR new GenericInternalRow(newValues) } - override def genericGet(i: Int): Any = values(i).boxed + override protected def genericGet(i: Int): Any = values(i).boxed override def update(ordinal: Int, value: Any) { if (value == null) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala index 88fb516e64aa..a73024d6adba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala @@ -31,8 +31,11 @@ case class Average(child: Expression) extends AlgebraicAggregate { override def dataType: DataType = resultType // Expected input data type. - // TODO: Once we remove the old code path, we can use our analyzer to cast NullType - // to the default data type of the NumericType. + // TODO: Right now, we replace old aggregate functions (based on AggregateExpression1) to the + // new version at planning time (after analysis phase). For now, NullType is added at here + // to make it resolved when we have cases like `select avg(null)`. + // We can use our analyzer to cast NullType to the default data type of the NumericType once + // we remove the old aggregate functions. Then, we will not need NullType at here. override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, NullType)) private val resultType = child.dataType match { @@ -256,12 +259,19 @@ case class Sum(child: Expression) extends AlgebraicAggregate { override def dataType: DataType = resultType // Expected input data type. + // TODO: Right now, we replace old aggregate functions (based on AggregateExpression1) to the + // new version at planning time (after analysis phase). For now, NullType is added at here + // to make it resolved when we have cases like `select sum(null)`. + // We can use our analyzer to cast NullType to the default data type of the NumericType once + // we remove the old aggregate functions. Then, we will not need NullType at here. override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(LongType, DoubleType, DecimalType, NullType)) private val resultType = child.dataType match { case DecimalType.Fixed(precision, scale) => DecimalType.bounded(precision + 10, scale) + // TODO: Remove this line once we remove the NullType from inputTypes. 
+ case NullType => IntegerType case _ => child.dataType } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index 4abfdfe87d5e..576d8c7a3a68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -93,7 +93,7 @@ private[sql] case class AggregateExpression2( AttributeSet(childReferences) } - override def toString: String = s"(${aggregateFunction}2,mode=$mode,isDistinct=$isDistinct)" + override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)" } abstract class AggregateFunction2 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala index 5d4b349b1597..5e8298aaaa9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala @@ -623,7 +623,7 @@ case class CombineSetsAndSumFunction( null } else { Cast(Literal( - casted.iterator.map(f => f.genericGet(0)).reduceLeft( + casted.iterator.map(f => f.get(0, null)).reduceLeft( base.dataType.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]].plus)), base.dataType).eval(null) } @@ -650,6 +650,7 @@ case class FirstFunction(expr: Expression, base: AggregateExpression1) extends A var result: Any = null override def update(input: InternalRow): Unit = { + // We ignore null values. if (result == null) { result = expr.eval(input) } @@ -679,10 +680,14 @@ case class LastFunction(expr: Expression, base: AggregateExpression1) extends Ag var result: Any = null override def update(input: InternalRow): Unit = { - result = input + val value = expr.eval(input) + // We ignore null values. 
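Aside: the TODO comments added to Average and Sum above exist so that queries whose only aggregate input is a null literal still resolve. A hedged example (not part of the patch) of the behavior they preserve; sqlContext stands in for an existing 1.5 SQLContext:

    // Both aggregates must analyze even though every input value is null;
    // the NullType entry in inputTypes is what lets this resolve, and Sum's
    // NullType => IntegerType fallback picks a concrete result type.
    val df = sqlContext.sql("SELECT sum(null), avg(null)")
    df.collect()   // a single row whose two columns are both null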
+ if (value != null) { + result = value + } } override def eval(input: InternalRow): Any = { - if (result != null) expr.eval(result.asInstanceOf[InternalRow]) else null + result } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 0891b5549471..98464edf4d39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -320,7 +320,7 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic { override def nullable: Boolean = left.nullable && right.nullable - private lazy val ordering = TypeUtils.getOrdering(dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) override def eval(input: InternalRow): Any = { val input1 = left.eval(input) @@ -374,7 +374,7 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic { override def nullable: Boolean = left.nullable && right.nullable - private lazy val ordering = TypeUtils.getOrdering(dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) override def eval(input: InternalRow): Any = { val input1 = left.eval(input) @@ -511,6 +511,6 @@ case class Pmod(left: Expression, right: Expression) extends BinaryArithmetic { private def pmod(a: Decimal, n: Decimal): Decimal = { val r = a % n - if (r.compare(Decimal(0)) < 0) {(r + n) % n} else r + if (r.compare(Decimal.ZERO) < 0) {(r + n) % n} else r } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 7b41c9a3f3b8..bf96248feaef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.language.existentials import com.google.common.cache.{CacheBuilder, CacheLoader} @@ -27,7 +28,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.types._ @@ -265,6 +266,45 @@ class CodeGenContext { def isPrimitiveType(jt: String): Boolean = primitiveTypes.contains(jt) def isPrimitiveType(dt: DataType): Boolean = isPrimitiveType(javaType(dt)) + + /** + * Splits the generated code of expressions into multiple functions, because function has + * 64kb code size limit in JVM + * + * @param row the variable name of row that is used by expressions + */ + def splitExpressions(row: String, expressions: Seq[String]): String = { + val 
blocks = new ArrayBuffer[String]() + val blockBuilder = new StringBuilder() + for (code <- expressions) { + // We can't know how many byte code will be generated, so use the number of bytes as limit + if (blockBuilder.length > 64 * 1000) { + blocks.append(blockBuilder.toString()) + blockBuilder.clear() + } + blockBuilder.append(code) + } + blocks.append(blockBuilder.toString()) + + if (blocks.length == 1) { + // inline execution if only one block + blocks.head + } else { + val apply = freshName("apply") + val functions = blocks.zipWithIndex.map { case (body, i) => + val name = s"${apply}_$i" + val code = s""" + |private void $name(InternalRow $row) { + | $body + |} + """.stripMargin + addNewFunction(name, code) + name + } + + functions.map(name => s"$name($row);").mkString("\n") + } + } } /** @@ -289,15 +329,15 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin protected def declareMutableStates(ctx: CodeGenContext): String = { ctx.mutableStates.map { case (javaType, variableName, _) => s"private $javaType $variableName;" - }.mkString + }.mkString("\n") } protected def initMutableStates(ctx: CodeGenContext): String = { - ctx.mutableStates.map(_._3).mkString + ctx.mutableStates.map(_._3).mkString("\n") } protected def declareAddedFunctions(ctx: CodeGenContext): String = { - ctx.addedFuntions.map { case (funcName, funcCode) => funcCode }.mkString + ctx.addedFuntions.map { case (funcName, funcCode) => funcCode }.mkString("\n") } /** @@ -328,8 +368,10 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin private[this] def doCompile(code: String): GeneratedClass = { val evaluator = new ClassBodyEvaluator() evaluator.setParentClassLoader(getClass.getClassLoader) + // Cannot be under package codegen, or fail with java.lang.InstantiationException + evaluator.setClassName("org.apache.spark.sql.catalyst.expressions.GeneratedClass") evaluator.setDefaultImports(Array( - classOf[PlatformDependent].getName, + classOf[Platform].getName, classOf[InternalRow].getName, classOf[UnsafeRow].getName, classOf[UTF8String].getName, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index e4a8fc24dac2..b4d4df8934bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -21,6 +21,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp +import org.apache.spark.sql.types.DecimalType // MutableProjection is not accessible in Java abstract class BaseMutableProjection extends MutableProjection @@ -39,62 +40,46 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu protected def create(expressions: Seq[Expression]): (() => MutableProjection) = { val ctx = newCodeGenContext() - val projectionCode = expressions.zipWithIndex.map { + val projectionCodes = expressions.zipWithIndex.map { case (NoOp, _) => "" case (e, i) => val evaluationCode = e.gen(ctx) - evaluationCode.code + + if (e.dataType.isInstanceOf[DecimalType]) { + // Can't call setNullAt on DecimalType, because we need to keep the offset s""" + ${evaluationCode.code} + if (${evaluationCode.isNull}) { + 
${ctx.setColumn("mutableRow", e.dataType, i, null)}; + } else { + ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)}; + } + """ + } else { + s""" + ${evaluationCode.code} if (${evaluationCode.isNull}) { mutableRow.setNullAt($i); } else { ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)}; } """ + } } - // collect projections into blocks as function has 64kb codesize limit in JVM - val projectionBlocks = new ArrayBuffer[String]() - val blockBuilder = new StringBuilder() - for (projection <- projectionCode) { - if (blockBuilder.length > 16 * 1000) { - projectionBlocks.append(blockBuilder.toString()) - blockBuilder.clear() - } - blockBuilder.append(projection) - } - projectionBlocks.append(blockBuilder.toString()) - - val (projectionFuns, projectionCalls) = { - // inline execution if codesize limit was not broken - if (projectionBlocks.length == 1) { - ("", projectionBlocks.head) - } else { - ( - projectionBlocks.zipWithIndex.map { case (body, i) => - s""" - |private void apply$i(InternalRow i) { - | $body - |} - """.stripMargin - }.mkString, - projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n") - ) - } - } + val allProjections = ctx.splitExpressions("i", projectionCodes) val code = s""" public Object generate($exprType[] expr) { - return new SpecificProjection(expr); + return new SpecificMutableProjection(expr); } - class SpecificProjection extends ${classOf[BaseMutableProjection].getName} { + class SpecificMutableProjection extends ${classOf[BaseMutableProjection].getName} { private $exprType[] expressions; private $mutableRowType mutableRow; ${declareMutableStates(ctx)} ${declareAddedFunctions(ctx)} - public SpecificProjection($exprType[] expr) { + public SpecificMutableProjection($exprType[] expr) { expressions = expr; mutableRow = new $genericMutableRowType(${expressions.size}); ${initMutableStates(ctx)} @@ -110,12 +95,9 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu return (InternalRow) mutableRow; } - $projectionFuns - public Object apply(Object _i) { InternalRow i = (InternalRow) _i; - $projectionCalls - + $allProjections return mutableRow; } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala index 1572b2b99ab6..c744e84d822e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -25,6 +26,8 @@ import org.apache.spark.sql.types._ */ abstract class BaseProjection extends Projection {} +abstract class CodeGenMutableRow extends MutableRow with BaseGenericInternalRow + /** * Generates bytecode that produces a new [[InternalRow]] object based on a fixed set of input * [[Expression Expressions]] and a given input [[InternalRow]]. 
The returned [[InternalRow]] @@ -171,7 +174,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] { return new SpecificRow((InternalRow) r); } - final class SpecificRow extends ${classOf[MutableRow].getName} { + final class SpecificRow extends ${classOf[CodeGenMutableRow].getName} { $columns @@ -184,6 +187,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] { public void setNullAt(int i) { nullBits[i] = true; } public boolean isNullAt(int i) { return nullBits[i]; } + @Override public Object genericGet(int i) { if (isNullAt(i)) return null; switch (i) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index f06ffc5449e7..7ad352d7ce3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -17,11 +17,9 @@ package org.apache.spark.sql.catalyst.expressions.codegen -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp -import org.apache.spark.sql.types.{StringType, StructType, DataType} +import org.apache.spark.sql.types._ /** @@ -36,81 +34,117 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] protected def bind(in: Seq[Expression], inputSchema: Seq[Attribute]): Seq[Expression] = in.map(BindReferences.bindReference(_, inputSchema)) - private def genUpdater( + private def createCodeForStruct( ctx: CodeGenContext, - setter: String, - dataType: DataType, - ordinal: Int, - value: String): String = { - dataType match { - case struct: StructType => - val rowTerm = ctx.freshName("row") - val updates = struct.map(_.dataType).zipWithIndex.map { case (dt, i) => - val colTerm = ctx.freshName("col") - s""" - if ($value.isNullAt($i)) { - $rowTerm.setNullAt($i); - } else { - ${ctx.javaType(dt)} $colTerm = ${ctx.getValue(value, dt, s"$i")}; - ${genUpdater(ctx, rowTerm, dt, i, colTerm)}; - } - """ - }.mkString("\n") - s""" - $genericMutableRowType $rowTerm = new $genericMutableRowType(${struct.fields.length}); - $updates - $setter.update($ordinal, $rowTerm.copy()); - """ - case _ => - ctx.setColumn(setter, dataType, ordinal, value) + input: String, + schema: StructType): GeneratedExpressionCode = { + val tmp = ctx.freshName("tmp") + val output = ctx.freshName("safeRow") + val values = ctx.freshName("values") + // These expressions could be splitted into multiple functions + ctx.addMutableState("Object[]", values, s"this.$values = null;") + + val rowClass = classOf[GenericInternalRow].getName + + val fieldWriters = schema.map(_.dataType).zipWithIndex.map { case (dt, i) => + val converter = convertToSafe(ctx, ctx.getValue(tmp, dt, i.toString), dt) + s""" + if (!$tmp.isNullAt($i)) { + ${converter.code} + $values[$i] = ${converter.primitive}; + } + """ } + val allFields = ctx.splitExpressions(tmp, fieldWriters) + val code = s""" + final InternalRow $tmp = $input; + this.$values = new Object[${schema.length}]; + $allFields + final InternalRow $output = new $rowClass($values); + """ + + GeneratedExpressionCode(code, "false", output) + } + + private def createCodeForArray( + ctx: CodeGenContext, + input: String, + elementType: DataType): GeneratedExpressionCode = { + 
val tmp = ctx.freshName("tmp") + val output = ctx.freshName("safeArray") + val values = ctx.freshName("values") + val numElements = ctx.freshName("numElements") + val index = ctx.freshName("index") + val arrayClass = classOf[GenericArrayData].getName + + val elementConverter = convertToSafe(ctx, ctx.getValue(tmp, elementType, index), elementType) + val code = s""" + final ArrayData $tmp = $input; + final int $numElements = $tmp.numElements(); + final Object[] $values = new Object[$numElements]; + for (int $index = 0; $index < $numElements; $index++) { + if (!$tmp.isNullAt($index)) { + ${elementConverter.code} + $values[$index] = ${elementConverter.primitive}; + } + } + final ArrayData $output = new $arrayClass($values); + """ + + GeneratedExpressionCode(code, "false", output) + } + + private def createCodeForMap( + ctx: CodeGenContext, + input: String, + keyType: DataType, + valueType: DataType): GeneratedExpressionCode = { + val tmp = ctx.freshName("tmp") + val output = ctx.freshName("safeMap") + val mapClass = classOf[ArrayBasedMapData].getName + + val keyConverter = createCodeForArray(ctx, s"$tmp.keyArray()", keyType) + val valueConverter = createCodeForArray(ctx, s"$tmp.valueArray()", valueType) + val code = s""" + final MapData $tmp = $input; + ${keyConverter.code} + ${valueConverter.code} + final MapData $output = new $mapClass(${keyConverter.primitive}, ${valueConverter.primitive}); + """ + + GeneratedExpressionCode(code, "false", output) + } + + private def convertToSafe( + ctx: CodeGenContext, + input: String, + dataType: DataType): GeneratedExpressionCode = dataType match { + case s: StructType => createCodeForStruct(ctx, input, s) + case ArrayType(elementType, _) => createCodeForArray(ctx, input, elementType) + case MapType(keyType, valueType, _) => createCodeForMap(ctx, input, keyType, valueType) + // UTF8String act as a pointer if it's inside UnsafeRow, so copy it to make it safe. 
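Aside: both projection generators in this area now delegate block splitting to ctx.splitExpressions, added in the CodeGenerator hunk earlier, instead of hand-rolling the block builder. A standalone sketch (not part of the patch) of that chunking idea, independent of the codegen context; names are hypothetical:

    import scala.collection.mutable.ArrayBuffer

    // Group generated statements into blocks of bounded source length so that no
    // single generated method risks exceeding the JVM's 64KB bytecode limit.
    def chunkStatements(statements: Seq[String], limit: Int = 64 * 1000): Seq[String] = {
      val blocks = new ArrayBuffer[String]()
      val current = new StringBuilder
      for (stmt <- statements) {
        if (current.length > limit) {
          blocks += current.toString()
          current.clear()
        }
        current.append(stmt)
      }
      blocks += current.toString()
      blocks
    }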
+ case StringType => GeneratedExpressionCode("", "false", s"$input.clone()") + case _ => GeneratedExpressionCode("", "false", input) } protected def create(expressions: Seq[Expression]): Projection = { val ctx = newCodeGenContext() - val projectionCode = expressions.zipWithIndex.map { + val expressionCodes = expressions.zipWithIndex.map { case (NoOp, _) => "" case (e, i) => val evaluationCode = e.gen(ctx) + val converter = convertToSafe(ctx, evaluationCode.primitive, e.dataType) evaluationCode.code + s""" if (${evaluationCode.isNull}) { mutableRow.setNullAt($i); } else { - ${genUpdater(ctx, "mutableRow", e.dataType, i, evaluationCode.primitive)}; + ${converter.code} + ${ctx.setColumn("mutableRow", e.dataType, i, converter.primitive)}; } """ } - // collect projections into blocks as function has 64kb codesize limit in JVM - val projectionBlocks = new ArrayBuffer[String]() - val blockBuilder = new StringBuilder() - for (projection <- projectionCode) { - if (blockBuilder.length > 16 * 1000) { - projectionBlocks.append(blockBuilder.toString()) - blockBuilder.clear() - } - blockBuilder.append(projection) - } - projectionBlocks.append(blockBuilder.toString()) - - val (projectionFuns, projectionCalls) = { - // inline it if we have only one block - if (projectionBlocks.length == 1) { - ("", projectionBlocks.head) - } else { - ( - projectionBlocks.zipWithIndex.map { case (body, i) => - s""" - |private void apply$i(InternalRow i) { - | $body - |} - """.stripMargin - }.mkString, - projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n") - ) - } - } - + val allExpressions = ctx.splitExpressions("i", expressionCodes) val code = s""" public Object generate($exprType[] expr) { return new SpecificSafeProjection(expr); @@ -121,6 +155,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] private $exprType[] expressions; private $mutableRowType mutableRow; ${declareMutableStates(ctx)} + ${declareAddedFunctions(ctx)} public SpecificSafeProjection($exprType[] expr) { expressions = expr; @@ -128,12 +163,9 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] ${initMutableStates(ctx)} } - $projectionFuns - public Object apply(Object _i) { InternalRow i = (InternalRow) _i; - $projectionCalls - + $allExpressions return mutableRow; } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index fc3ecf545142..b570fe86db1a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.PlatformDependent /** * Generates a [[Projection]] that returns an [[UnsafeRow]]. @@ -41,14 +40,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro private val ArrayWriter = classOf[UnsafeRowWriters.ArrayWriter].getName private val MapWriter = classOf[UnsafeRowWriters.MapWriter].getName - private val PlatformDependent = classOf[PlatformDependent].getName - /** Returns true iff we support this data type. 
*/ def canSupport(dataType: DataType): Boolean = dataType match { + case NullType => true case t: AtomicType => true case _: CalendarIntervalType => true case t: StructType => t.toSeq.forall(field => canSupport(field.dataType)) - case NullType => true case t: ArrayType if canSupport(t.elementType) => true case MapType(kt, vt, _) if canSupport(kt) && canSupport(vt) => true case _ => false @@ -56,19 +53,19 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro def genAdditionalSize(dt: DataType, ev: GeneratedExpressionCode): String = dt match { case t: DecimalType if t.precision > Decimal.MAX_LONG_DIGITS => - s" + (${ev.isNull} ? 0 : $DecimalWriter.getSize(${ev.primitive}))" + s"$DecimalWriter.getSize(${ev.primitive})" case StringType => - s" + (${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive}))" + s"${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive})" case BinaryType => - s" + (${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive}))" + s"${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive})" case CalendarIntervalType => - s" + (${ev.isNull} ? 0 : 16)" + s"${ev.isNull} ? 0 : 16" case _: StructType => - s" + (${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive}))" + s"${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive})" case _: ArrayType => - s" + (${ev.isNull} ? 0 : $ArrayWriter.getSize(${ev.primitive}))" + s"${ev.isNull} ? 0 : $ArrayWriter.getSize(${ev.primitive})" case _: MapType => - s" + (${ev.isNull} ? 0 : $MapWriter.getSize(${ev.primitive}))" + s"${ev.isNull} ? 0 : $MapWriter.getSize(${ev.primitive})" case _ => "" } @@ -76,194 +73,46 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ctx: CodeGenContext, fieldType: DataType, ev: GeneratedExpressionCode, - primitive: String, + target: String, index: Int, cursor: String): String = fieldType match { case _ if ctx.isPrimitiveType(fieldType) => - s"${ctx.setColumn(primitive, fieldType, index, ev.primitive)}" + s"${ctx.setColumn(target, fieldType, index, ev.primitive)}" case t: DecimalType if t.precision <= Decimal.MAX_LONG_DIGITS => s""" // make sure Decimal object has the same scale as DecimalType if (${ev.primitive}.changePrecision(${t.precision}, ${t.scale})) { - $CompactDecimalWriter.write($primitive, $index, $cursor, ${ev.primitive}); + $CompactDecimalWriter.write($target, $index, $cursor, ${ev.primitive}); } else { - $primitive.setNullAt($index); + $target.setNullAt($index); } """ case t: DecimalType if t.precision > Decimal.MAX_LONG_DIGITS => s""" // make sure Decimal object has the same scale as DecimalType if (${ev.primitive}.changePrecision(${t.precision}, ${t.scale})) { - $cursor += $DecimalWriter.write($primitive, $index, $cursor, ${ev.primitive}); + $cursor += $DecimalWriter.write($target, $index, $cursor, ${ev.primitive}); } else { - $primitive.setNullAt($index); + $cursor += $DecimalWriter.write($target, $index, $cursor, null); } """ case StringType => - s"$cursor += $StringWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $StringWriter.write($target, $index, $cursor, ${ev.primitive})" case BinaryType => - s"$cursor += $BinaryWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $BinaryWriter.write($target, $index, $cursor, ${ev.primitive})" case CalendarIntervalType => - s"$cursor += $IntervalWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $IntervalWriter.write($target, $index, $cursor, ${ev.primitive})" case _: StructType => - s"$cursor += 
$StructWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $StructWriter.write($target, $index, $cursor, ${ev.primitive})" case _: ArrayType => - s"$cursor += $ArrayWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $ArrayWriter.write($target, $index, $cursor, ${ev.primitive})" case _: MapType => - s"$cursor += $MapWriter.write($primitive, $index, $cursor, ${ev.primitive})" + s"$cursor += $MapWriter.write($target, $index, $cursor, ${ev.primitive})" case NullType => "" case _ => throw new UnsupportedOperationException(s"Not supported DataType: $fieldType") } - /** - * Generates the code to create an [[UnsafeRow]] object based on the input expressions. - * @param ctx context for code generation - * @param ev specifies the name of the variable for the output [[UnsafeRow]] object - * @param expressions input expressions - * @return generated code to put the expression output into an [[UnsafeRow]] - */ - def createCode(ctx: CodeGenContext, ev: GeneratedExpressionCode, expressions: Seq[Expression]) - : String = { - - val ret = ev.primitive - ctx.addMutableState("UnsafeRow", ret, s"$ret = new UnsafeRow();") - val buffer = ctx.freshName("buffer") - ctx.addMutableState("byte[]", buffer, s"$buffer = new byte[64];") - val cursor = ctx.freshName("cursor") - val numBytes = ctx.freshName("numBytes") - - val exprs = expressions.map { e => e.dataType match { - case st: StructType => createCodeForStruct(ctx, e.gen(ctx), st) - case _ => e.gen(ctx) - }} - val allExprs = exprs.map(_.code).mkString("\n") - - val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length) - val additionalSize = expressions.zipWithIndex.map { - case (e, i) => genAdditionalSize(e.dataType, exprs(i)) - }.mkString("") - - val writers = expressions.zipWithIndex.map { case (e, i) => - val update = genFieldWriter(ctx, e.dataType, exprs(i), ret, i, cursor) - s"""if (${exprs(i).isNull}) { - $ret.setNullAt($i); - } else { - $update; - }""" - }.mkString("\n ") - - s""" - $allExprs - int $numBytes = $fixedSize $additionalSize; - if ($numBytes > $buffer.length) { - $buffer = new byte[$numBytes]; - } - - $ret.pointTo( - $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET, - ${expressions.size}, - $numBytes); - int $cursor = $fixedSize; - - $writers - boolean ${ev.isNull} = false; - """ - } - - /** - * Generates the Java code to convert a struct (backed by InternalRow) to UnsafeRow. - * - * This function also handles nested structs by recursively generating the code to do conversion. - * - * @param ctx code generation context - * @param input the input struct, identified by a [[GeneratedExpressionCode]] - * @param schema schema of the struct field - */ - // TODO: refactor createCode and this function to reduce code duplication. 
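Aside: the fixedSize expression that appears throughout these generators is just the null-tracking bitset plus one 8-byte word per field. A small illustration (not part of the patch), assuming UnsafeRow.calculateBitSetWidthInBytes from the catalyst expressions package:

    import org.apache.spark.sql.catalyst.expressions.UnsafeRow

    // Fixed-length region of an UnsafeRow: the null bitset (one bit per field,
    // rounded up to whole 8-byte words) followed by an 8-byte slot per field.
    def fixedRegionBytes(numFields: Int): Int =
      UnsafeRow.calculateBitSetWidthInBytes(numFields) + 8 * numFields

    fixedRegionBytes(3)   // 8 (one bitset word) + 24 (three slots) = 32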
- private def createCodeForStruct( - ctx: CodeGenContext, - input: GeneratedExpressionCode, - schema: StructType): GeneratedExpressionCode = { - - val isNull = input.isNull - val primitive = ctx.freshName("structConvert") - ctx.addMutableState("UnsafeRow", primitive, s"$primitive = new UnsafeRow();") - val buffer = ctx.freshName("buffer") - ctx.addMutableState("byte[]", buffer, s"$buffer = new byte[64];") - val cursor = ctx.freshName("cursor") - - val exprs: Seq[GeneratedExpressionCode] = schema.map(_.dataType).zipWithIndex.map { - case (dt, i) => dt match { - case st: StructType => - val nestedStructEv = GeneratedExpressionCode( - code = "", - isNull = s"${input.primitive}.isNullAt($i)", - primitive = s"${ctx.getValue(input.primitive, dt, i.toString)}" - ) - createCodeForStruct(ctx, nestedStructEv, st) - case _ => - GeneratedExpressionCode( - code = "", - isNull = s"${input.primitive}.isNullAt($i)", - primitive = s"${ctx.getValue(input.primitive, dt, i.toString)}" - ) - } - } - val allExprs = exprs.map(_.code).mkString("\n") - - val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length) - val additionalSize = schema.toSeq.map(_.dataType).zip(exprs).map { case (dt, ev) => - genAdditionalSize(dt, ev) - }.mkString("") - - val writers = schema.toSeq.map(_.dataType).zip(exprs).zipWithIndex.map { case ((dt, ev), i) => - val update = genFieldWriter(ctx, dt, ev, primitive, i, cursor) - s""" - if (${exprs(i).isNull}) { - $primitive.setNullAt($i); - } else { - $update; - } - """ - }.mkString("\n ") - - // Note that we add a shortcut here for performance: if the input is already an UnsafeRow, - // just copy the bytes directly into our buffer space without running any conversion. - // We also had to use a hack to introduce a "tmp" variable, to avoid the Java compiler from - // complaining that a GenericMutableRow (generated by expressions) cannot be cast to UnsafeRow. - val tmp = ctx.freshName("tmp") - val numBytes = ctx.freshName("numBytes") - val code = s""" - |${input.code} - |if (!${input.isNull}) { - | Object $tmp = (Object) ${input.primitive}; - | if ($tmp instanceof UnsafeRow) { - | $primitive = (UnsafeRow) $tmp; - | } else { - | $allExprs - | - | int $numBytes = $fixedSize $additionalSize; - | if ($numBytes > $buffer.length) { - | $buffer = new byte[$numBytes]; - | } - | - | $primitive.pointTo( - | $buffer, - | $PlatformDependent.BYTE_ARRAY_OFFSET, - | ${exprs.size}, - | $numBytes); - | int $cursor = $fixedSize; - | - | $writers - | } - |} - """.stripMargin - - GeneratedExpressionCode(code, isNull, primitive) - } - /** * Generates the Java code to convert a struct (backed by InternalRow) to UnsafeRow. * @@ -271,55 +120,70 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro * @param inputs could be the codes for expressions or input struct fields. 
* @param inputTypes types of the inputs */ - private def createCodeForStruct2( + private def createCodeForStruct( ctx: CodeGenContext, + row: String, inputs: Seq[GeneratedExpressionCode], inputTypes: Seq[DataType]): GeneratedExpressionCode = { + val fixedSize = 8 * inputTypes.length + UnsafeRow.calculateBitSetWidthInBytes(inputTypes.length) + val output = ctx.freshName("convertedStruct") - ctx.addMutableState("UnsafeRow", output, s"$output = new UnsafeRow();") + ctx.addMutableState("UnsafeRow", output, s"this.$output = new UnsafeRow();") val buffer = ctx.freshName("buffer") - ctx.addMutableState("byte[]", buffer, s"$buffer = new byte[64];") - val numBytes = ctx.freshName("numBytes") + ctx.addMutableState("byte[]", buffer, s"this.$buffer = new byte[$fixedSize];") val cursor = ctx.freshName("cursor") + ctx.addMutableState("int", cursor, s"this.$cursor = 0;") + val tmp = ctx.freshName("tmpBuffer") - val convertedFields = inputTypes.zip(inputs).map { case (dt, input) => - createConvertCode(ctx, input, dt) - } - - val fixedSize = 8 * inputTypes.length + UnsafeRow.calculateBitSetWidthInBytes(inputTypes.length) - val additionalSize = inputTypes.zip(convertedFields).map { case (dt, ev) => - genAdditionalSize(dt, ev) - }.mkString("") - - val fieldWriters = inputTypes.zip(convertedFields).zipWithIndex.map { case ((dt, ev), i) => - val update = genFieldWriter(ctx, dt, ev, output, i, cursor) + val convertedFields = inputTypes.zip(inputs).zipWithIndex.map { case ((dt, input), i) => + val ev = createConvertCode(ctx, input, dt) + val growBuffer = if (!UnsafeRow.isFixedLength(dt)) { + val numBytes = ctx.freshName("numBytes") + s""" + int $numBytes = $cursor + (${genAdditionalSize(dt, ev)}); + if ($buffer.length < $numBytes) { + // This will not happen frequently, because the buffer is re-used. + byte[] $tmp = new byte[$numBytes * 2]; + Platform.copyMemory($buffer, Platform.BYTE_ARRAY_OFFSET, + $tmp, Platform.BYTE_ARRAY_OFFSET, $buffer.length); + $buffer = $tmp; + } + $output.pointTo($buffer, Platform.BYTE_ARRAY_OFFSET, ${inputTypes.length}, $numBytes); + """ + } else { + "" + } + val update = dt match { + case dt: DecimalType if dt.precision > Decimal.MAX_LONG_DIGITS => + // Can't call setNullAt() for DecimalType + s""" + if (${ev.isNull}) { + $cursor += $DecimalWriter.write($output, $i, $cursor, null); + } else { + ${genFieldWriter(ctx, dt, ev, output, i, cursor)}; + } + """ + case _ => + s""" + if (${ev.isNull}) { + $output.setNullAt($i); + } else { + ${genFieldWriter(ctx, dt, ev, output, i, cursor)}; + } + """ + } s""" - if (${ev.isNull}) { - $output.setNullAt($i); - } else { - $update; - } + ${ev.code} + $growBuffer + $update """ - }.mkString("\n") + } val code = s""" - ${convertedFields.map(_.code).mkString("\n")} - - final int $numBytes = $fixedSize $additionalSize; - if ($numBytes > $buffer.length) { - $buffer = new byte[$numBytes]; - } - - $output.pointTo( - $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET, - ${inputTypes.length}, - $numBytes); - - int $cursor = $fixedSize; - - $fieldWriters + $cursor = $fixedSize; + $output.pointTo($buffer, Platform.BYTE_ARRAY_OFFSET, ${inputTypes.length}, $cursor); + ${ctx.splitExpressions(row, convertedFields)} """ GeneratedExpressionCode(code, "false", output) } @@ -360,7 +224,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro // go through the input array to calculate how many bytes we need. 
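Aside: the growBuffer snippet generated above is a plain doubling strategy over the reused byte buffer. A minimal Scala rendering of the same idea (not part of the patch); the helper name and parameters are hypothetical:

    import org.apache.spark.unsafe.Platform

    // Grow the reused buffer when the next variable-length field would not fit,
    // copying what has already been written; doubling keeps reallocation rare.
    def ensureCapacity(buffer: Array[Byte], neededBytes: Int): Array[Byte] = {
      if (buffer.length < neededBytes) {
        val grown = new Array[Byte](neededBytes * 2)
        Platform.copyMemory(
          buffer, Platform.BYTE_ARRAY_OFFSET,
          grown, Platform.BYTE_ARRAY_OFFSET,
          buffer.length)
        grown
      } else {
        buffer
      }
    }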
val calculateNumBytes = elementType match { - case _ if (ctx.isPrimitiveType(elementType)) => + case _ if ctx.isPrimitiveType(elementType) => // Should we do word align? val elementSize = elementType.defaultSize s""" @@ -373,6 +237,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case _ => val writer = getWriter(elementType) val elementSize = s"$writer.getSize($elements[$index])" + // TODO(davies): avoid the copy val unsafeType = elementType match { case _: StructType => "UnsafeRow" case _: ArrayType => "UnsafeArrayData" @@ -385,8 +250,13 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case _ => "" } + val newElements = if (elementType == BinaryType) { + s"new byte[$numElements][]" + } else { + s"new $unsafeType[$numElements]" + } s""" - final $unsafeType[] $elements = new $unsafeType[$numElements]; + final $unsafeType[] $elements = $newElements; for (int $index = 0; $index < $numElements; $index++) { ${convertedElement.code} if (!${convertedElement.isNull}) { @@ -398,21 +268,21 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro } val writeElement = elementType match { - case _ if (ctx.isPrimitiveType(elementType)) => + case _ if ctx.isPrimitiveType(elementType) => // Should we do word align? val elementSize = elementType.defaultSize s""" - $PlatformDependent.UNSAFE.put${ctx.primitiveTypeName(elementType)}( + Platform.put${ctx.primitiveTypeName(elementType)}( $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET + $cursor, + Platform.BYTE_ARRAY_OFFSET + $cursor, ${convertedElement.primitive}); $cursor += $elementSize; """ case t: DecimalType if t.precision <= Decimal.MAX_LONG_DIGITS => s""" - $PlatformDependent.UNSAFE.putLong( + Platform.putLong( $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET + $cursor, + Platform.BYTE_ARRAY_OFFSET + $cursor, ${convertedElement.primitive}.toUnscaledLong()); $cursor += 8; """ @@ -421,7 +291,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro s""" $cursor += $writer.write( $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET + $cursor, + Platform.BYTE_ARRAY_OFFSET + $cursor, $elements[$index]); """ } @@ -455,23 +325,16 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro for (int $index = 0; $index < $numElements; $index++) { if ($checkNull) { // If element is null, write the negative value address into offset region. 
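Aside: the offset-region writes just below encode nullness by sign: a present element stores its byte offset, a null element stores the negated offset. A literal-minded Scala rendering (not part of the patch); the helper name is hypothetical:

    import org.apache.spark.unsafe.Platform

    // Each array element gets a 4-byte slot holding its cursor (byte offset)
    // when present, or the negated cursor when the element is null.
    def writeOffset(buffer: Array[Byte], index: Int, cursor: Int, isNull: Boolean): Unit =
      Platform.putInt(
        buffer,
        Platform.BYTE_ARRAY_OFFSET + 4L * index,
        if (isNull) -cursor else cursor)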
- $PlatformDependent.UNSAFE.putInt( - $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET + 4 * $index, - -$cursor); + Platform.putInt($buffer, Platform.BYTE_ARRAY_OFFSET + 4 * $index, -$cursor); } else { - $PlatformDependent.UNSAFE.putInt( - $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET + 4 * $index, - $cursor); - + Platform.putInt($buffer, Platform.BYTE_ARRAY_OFFSET + 4 * $index, $cursor); $writeElement } } $output.pointTo( $buffer, - $PlatformDependent.BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, $numElements, $numBytes); } @@ -537,7 +400,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro val fieldIsNull = s"$tmp.isNullAt($i)" GeneratedExpressionCode("", fieldIsNull, getFieldCode) } - val converter = createCodeForStruct2(ctx, fieldEvals, fieldTypes) + val converter = createCodeForStruct(ctx, tmp, fieldEvals, fieldTypes) val code = s""" ${input.code} UnsafeRow $output = null; @@ -561,6 +424,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case _ => input } + def createCode(ctx: CodeGenContext, expressions: Seq[Expression]): GeneratedExpressionCode = { + val exprEvals = expressions.map(e => e.gen(ctx)) + val exprTypes = expressions.map(_.dataType) + createCodeForStruct(ctx, "i", exprEvals, exprTypes) + } + protected def canonicalize(in: Seq[Expression]): Seq[Expression] = in.map(ExpressionCanonicalizer.execute) @@ -570,8 +439,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro protected def create(expressions: Seq[Expression]): UnsafeProjection = { val ctx = newCodeGenContext() - val exprEvals = expressions.map(e => e.gen(ctx)) - val eval = createCodeForStruct2(ctx, exprEvals, expressions.map(_.dataType)) + val eval = createCode(ctx, expressions) val code = s""" public Object generate($exprType[] exprs) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index 30b51dd83fa9..da91ff29537b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, Attribute} import org.apache.spark.sql.types.StructType -import org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.unsafe.Platform abstract class UnsafeRowJoiner { @@ -52,9 +52,9 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U } def create(schema1: StructType, schema2: StructType): UnsafeRowJoiner = { - val offset = PlatformDependent.BYTE_ARRAY_OFFSET - val getLong = "PlatformDependent.UNSAFE.getLong" - val putLong = "PlatformDependent.UNSAFE.putLong" + val offset = Platform.BYTE_ARRAY_OFFSET + val getLong = "Platform.getLong" + val putLong = "Platform.putLong" val bitset1Words = (schema1.size + 63) / 64 val bitset2Words = (schema2.size + 63) / 64 @@ -96,7 +96,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U var cursor = offset + outputBitsetWords * 8 val copyFixedLengthRow1 = s""" |// Copy fixed length data for row1 - |PlatformDependent.copyMemory( + |Platform.copyMemory( | obj1, offset1 + ${bitset1Words * 8}, | buf, $cursor, | ${schema1.size * 8}); @@ -106,7 +106,7 @@ 
object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U // --------------------- copy fixed length portion from row 2 ----------------------- // val copyFixedLengthRow2 = s""" |// Copy fixed length data for row2 - |PlatformDependent.copyMemory( + |Platform.copyMemory( | obj2, offset2 + ${bitset2Words * 8}, | buf, $cursor, | ${schema2.size * 8}); @@ -118,7 +118,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U val copyVariableLengthRow1 = s""" |// Copy variable length data for row1 |long numBytesVariableRow1 = row1.getSizeInBytes() - $numBytesBitsetAndFixedRow1; - |PlatformDependent.copyMemory( + |Platform.copyMemory( | obj1, offset1 + ${(bitset1Words + schema1.size) * 8}, | buf, $cursor, | numBytesVariableRow1); @@ -129,7 +129,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U val copyVariableLengthRow2 = s""" |// Copy variable length data for row2 |long numBytesVariableRow2 = row2.getSizeInBytes() - $numBytesBitsetAndFixedRow2; - |PlatformDependent.copyMemory( + |Platform.copyMemory( | obj2, offset2 + ${(bitset2Words + schema2.size) * 8}, | buf, $cursor + numBytesVariableRow1, | numBytesVariableRow2); @@ -155,7 +155,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U |$putLong(buf, $cursor, $getLong(buf, $cursor) + ($shift << 32)); """.stripMargin } - }.mkString + }.mkString("\n") // ------------------------ Finally, put everything together --------------------------- // val code = s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 6ccb56578f79..7b8c5b723ded 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, CodeGenContext, GeneratedExpressionCode} -import org.apache.spark.sql.catalyst.util.TypeUtils +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, CodegenFallback, GeneratedExpressionCode} import org.apache.spark.sql.types._ /** @@ -115,3 +114,72 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def prettyName: String = "sort_array" } + +/** + * Checks if the array (left) has the element (right) + */ +case class ArrayContains(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = right.dataType match { + case NullType => Seq() + case _ => left.dataType match { + case n @ ArrayType(element, _) => Seq(n, element) + case _ => Seq() + } + } + + override def checkInputDataTypes(): TypeCheckResult = { + if (right.dataType == NullType) { + TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments") + } else if (!left.dataType.isInstanceOf[ArrayType] + || left.dataType.asInstanceOf[ArrayType].elementType != right.dataType) { + TypeCheckResult.TypeCheckFailure( + "Arguments must be an array followed by a value of same type as the array members") + } else { + TypeCheckResult.TypeCheckSuccess + } + } + 
+ override def nullable: Boolean = { + left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(arr: Any, value: Any): Any = { + var hasNull = false + arr.asInstanceOf[ArrayData].foreach(right.dataType, (i, v) => + if (v == null) { + hasNull = true + } else if (v == value) { + return true + } + ) + if (hasNull) { + null + } else { + false + } + } + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + nullSafeCodeGen(ctx, ev, (arr, value) => { + val i = ctx.freshName("i") + val getValue = ctx.getValue(arr, right.dataType, i) + s""" + for (int $i = 0; $i < $arr.numElements(); $i ++) { + if ($arr.isNullAt($i)) { + ${ev.isNull} = true; + } else if (${ctx.genEqual(right.dataType, value, getValue)}) { + ${ev.isNull} = false; + ${ev.primitive} = true; + break; + } + } + """ + }) + } + + override def prettyName: String = "array_contains" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index a145dfb4bbf0..1c546719730b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -75,8 +75,6 @@ case class CreateStruct(children: Seq[Expression]) extends Expression { override def foldable: Boolean = children.forall(_.foldable) - override lazy val resolved: Boolean = childrenResolved - override lazy val dataType: StructType = { val fields = children.zipWithIndex.map { case (child, idx) => child match { @@ -208,10 +206,15 @@ case class CreateStructUnsafe(children: Seq[Expression]) extends Expression { override def nullable: Boolean = false - override def eval(input: InternalRow): Any = throw new UnsupportedOperationException + override def eval(input: InternalRow): Any = { + InternalRow(children.map(_.eval(input)): _*) + } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - GenerateUnsafeProjection.createCode(ctx, ev, children) + val eval = GenerateUnsafeProjection.createCode(ctx, children) + ev.isNull = eval.isNull + ev.primitive = eval.primitive + eval.code } override def prettyName: String = "struct_unsafe" @@ -243,10 +246,15 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends Expression override def nullable: Boolean = false - override def eval(input: InternalRow): Any = throw new UnsupportedOperationException + override def eval(input: InternalRow): Any = { + InternalRow(valExprs.map(_.eval(input)): _*) + } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - GenerateUnsafeProjection.createCode(ctx, ev, valExprs) + val eval = GenerateUnsafeProjection.createCode(ctx, valExprs) + ev.isNull = eval.isNull + ev.primitive = eval.primitive + eval.code } override def prettyName: String = "named_struct_unsafe" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala similarity index 98% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 961b1d861680..d51f3d3cef58 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -319,7 +319,7 @@ case class Least(children: Seq[Expression]) extends Expression { override def nullable: Boolean = children.forall(_.nullable) override def foldable: Boolean = children.forall(_.foldable) - private lazy val ordering = TypeUtils.getOrdering(dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) override def checkInputDataTypes(): TypeCheckResult = { if (children.length <= 1) { @@ -374,7 +374,7 @@ case class Greatest(children: Seq[Expression]) extends Expression { override def nullable: Boolean = children.forall(_.nullable) override def foldable: Boolean = children.forall(_.foldable) - private lazy val ordering = TypeUtils.getOrdering(dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) override def checkInputDataTypes(): TypeCheckResult = { if (children.length <= 1) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala similarity index 72% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala index adb33e4c8d4a..b7be12f7aa74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala @@ -66,10 +66,44 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un * An expression used to wrap the children when promote the precision of DecimalType to avoid * promote multiple times. */ -case class ChangeDecimalPrecision(child: Expression) extends UnaryExpression { +case class PromotePrecision(child: Expression) extends UnaryExpression { override def dataType: DataType = child.dataType override def eval(input: InternalRow): Any = child.eval(input) override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx) override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = "" - override def prettyName: String = "change_decimal_precision" + override def prettyName: String = "promote_precision" +} + +/** + * Rounds the decimal to given scale and check whether the decimal can fit in provided precision + * or not, returns null if not. 
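+ *
+ * Illustrative sketch of the intended behavior (editorial example, not from the original patch;
+ * assumes a `BigDecimal` literal and `DecimalType` from org.apache.spark.sql.types):
+ * {{{
+ *   CheckOverflow(Literal(BigDecimal("12345.6")), DecimalType(6, 1)).eval()  // Decimal 12345.6
+ *   CheckOverflow(Literal(BigDecimal("12345.6")), DecimalType(4, 1)).eval()  // null: needs precision 6
+ * }}}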
+ */ +case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression { + + override def nullable: Boolean = true + + override def nullSafeEval(input: Any): Any = { + val d = input.asInstanceOf[Decimal].clone() + if (d.changePrecision(dataType.precision, dataType.scale)) { + d + } else { + null + } + } + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + nullSafeCodeGen(ctx, ev, eval => { + val tmp = ctx.freshName("tmp") + s""" + | Decimal $tmp = $eval.clone(); + | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) { + | ${ev.primitive} = $tmp; + | } else { + | ${ev.isNull} = true; + | } + """.stripMargin + }) + } + + override def toString: String = s"CheckOverflow($child, $dataType)" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index d474853355e5..c0845e1a0102 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import scala.collection.Map - import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala new file mode 100644 index 000000000000..23bfa18c9428 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import java.io.{StringWriter, ByteArrayOutputStream} + +import com.fasterxml.jackson.core._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.types.{StringType, DataType} +import org.apache.spark.unsafe.types.UTF8String + +import scala.util.parsing.combinator.RegexParsers + +private[this] sealed trait PathInstruction +private[this] object PathInstruction { + private[expressions] case object Subscript extends PathInstruction + private[expressions] case object Wildcard extends PathInstruction + private[expressions] case object Key extends PathInstruction + private[expressions] case class Index(index: Long) extends PathInstruction + private[expressions] case class Named(name: String) extends PathInstruction +} + +private[this] sealed trait WriteStyle +private[this] object WriteStyle { + private[expressions] case object RawStyle extends WriteStyle + private[expressions] case object QuotedStyle extends WriteStyle + private[expressions] case object FlattenStyle extends WriteStyle +} + +private[this] object JsonPathParser extends RegexParsers { + import PathInstruction._ + + def root: Parser[Char] = '$' + + def long: Parser[Long] = "\\d+".r ^? { + case x => x.toLong + } + + // parse `[*]` and `[123]` subscripts + def subscript: Parser[List[PathInstruction]] = + for { + operand <- '[' ~> ('*' ^^^ Wildcard | long ^^ Index) <~ ']' + } yield { + Subscript :: operand :: Nil + } + + // parse `.name` or `['name']` child expressions + def named: Parser[List[PathInstruction]] = + for { + name <- '.' ~> "[^\\.\\[]+".r | "[\\'" ~> "[^\\'\\?]+" <~ "\\']" + } yield { + Key :: Named(name) :: Nil + } + + // child wildcards: `..`, `.*` or `['*']` + def wildcard: Parser[List[PathInstruction]] = + (".*" | "['*']") ^^^ List(Wildcard) + + def node: Parser[List[PathInstruction]] = + wildcard | + named | + subscript + + val expression: Parser[List[PathInstruction]] = { + phrase(root ~> rep(node) ^^ (x => x.flatten)) + } + + def parse(str: String): Option[List[PathInstruction]] = { + this.parseAll(expression, str) match { + case Success(result, _) => + Some(result) + + case NoSuccess(msg, next) => + None + } + } +} + +private[this] object GetJsonObject { + private val jsonFactory = new JsonFactory() + + // Enabled for Hive compatibility + jsonFactory.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS) +} + +/** + * Extracts json object from a json string based on json path specified, and returns json string + * of the extracted json object. It will return null if the input json string is invalid. 
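+ *
+ * A rough usage sketch (illustrative only, not from the original patch; assumes foldable string
+ * [[Literal]] inputs):
+ * {{{
+ *   GetJsonObject(Literal("{\"a\": {\"b\": \"c\"}}"), Literal("$.a.b")).eval()
+ *   // returns UTF8String "c"
+ * }}}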
+ */ +case class GetJsonObject(json: Expression, path: Expression) + extends BinaryExpression with ExpectsInputTypes with CodegenFallback { + + import GetJsonObject._ + import PathInstruction._ + import WriteStyle._ + import com.fasterxml.jackson.core.JsonToken._ + + override def left: Expression = json + override def right: Expression = path + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def dataType: DataType = StringType + override def prettyName: String = "get_json_object" + + @transient private lazy val parsedPath = parsePath(path.eval().asInstanceOf[UTF8String]) + + override def eval(input: InternalRow): Any = { + val jsonStr = json.eval(input).asInstanceOf[UTF8String] + if (jsonStr == null) { + return null + } + + val parsed = if (path.foldable) { + parsedPath + } else { + parsePath(path.eval(input).asInstanceOf[UTF8String]) + } + + if (parsed.isDefined) { + try { + val parser = jsonFactory.createParser(jsonStr.getBytes) + val output = new ByteArrayOutputStream() + val generator = jsonFactory.createGenerator(output, JsonEncoding.UTF8) + parser.nextToken() + val matched = evaluatePath(parser, generator, RawStyle, parsed.get) + generator.close() + if (matched) { + UTF8String.fromBytes(output.toByteArray) + } else { + null + } + } catch { + case _: JsonProcessingException => null + } + } else { + null + } + } + + private def parsePath(path: UTF8String): Option[List[PathInstruction]] = { + if (path != null) { + JsonPathParser.parse(path.toString) + } else { + None + } + } + + // advance to the desired array index, assumes to start at the START_ARRAY token + private def arrayIndex(p: JsonParser, f: () => Boolean): Long => Boolean = { + case _ if p.getCurrentToken == END_ARRAY => + // terminate, nothing has been written + false + + case 0 => + // we've reached the desired index + val dirty = f() + + while (p.nextToken() != END_ARRAY) { + // advance the token stream to the end of the array + p.skipChildren() + } + + dirty + + case i if i > 0 => + // skip this token and evaluate the next + p.skipChildren() + p.nextToken() + arrayIndex(p, f)(i - 1) + } + + /** + * Evaluate a list of JsonPath instructions, returning a bool that indicates if any leaf nodes + * have been written to the generator + */ + private def evaluatePath( + p: JsonParser, + g: JsonGenerator, + style: WriteStyle, + path: List[PathInstruction]): Boolean = { + (p.getCurrentToken, path) match { + case (VALUE_STRING, Nil) if style == RawStyle => + // there is no array wildcard or slice parent, emit this string without quotes + if (p.hasTextCharacters) { + g.writeRaw(p.getTextCharacters, p.getTextOffset, p.getTextLength) + } else { + g.writeRaw(p.getText) + } + true + + case (START_ARRAY, Nil) if style == FlattenStyle => + // flatten this array into the parent + var dirty = false + while (p.nextToken() != END_ARRAY) { + dirty |= evaluatePath(p, g, style, Nil) + } + dirty + + case (_, Nil) => + // general case: just copy the child tree verbatim + g.copyCurrentStructure(p) + true + + case (START_OBJECT, Key :: xs) => + var dirty = false + while (p.nextToken() != END_OBJECT) { + if (dirty) { + // once a match has been found we can skip other fields + p.skipChildren() + } else { + dirty = evaluatePath(p, g, style, xs) + } + } + dirty + + case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) => + // special handling for the non-structure preserving double wildcard behavior in Hive + var dirty = false + g.writeStartArray() + while (p.nextToken() != END_ARRAY) { + dirty |= 
evaluatePath(p, g, FlattenStyle, xs) + } + g.writeEndArray() + dirty + + case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle => + // retain Flatten, otherwise use Quoted... cannot use Raw within an array + val nextStyle = style match { + case RawStyle => QuotedStyle + case FlattenStyle => FlattenStyle + case QuotedStyle => throw new IllegalStateException() + } + + // temporarily buffer child matches, the emitted json will need to be + // modified slightly if there is only a single element written + val buffer = new StringWriter() + val flattenGenerator = jsonFactory.createGenerator(buffer) + flattenGenerator.writeStartArray() + + var dirty = 0 + while (p.nextToken() != END_ARRAY) { + // track the number of array elements and only emit an outer array if + // we've written more than one element, this matches Hive's behavior + dirty += (if (evaluatePath(p, flattenGenerator, nextStyle, xs)) 1 else 0) + } + flattenGenerator.writeEndArray() + flattenGenerator.close() + + val buf = buffer.getBuffer + if (dirty > 1) { + g.writeRawValue(buf.toString) + } else if (dirty == 1) { + // remove outer array tokens + g.writeRawValue(buf.substring(1, buf.length()-1)) + } // else do not write anything + + dirty > 0 + + case (START_ARRAY, Subscript :: Wildcard :: xs) => + var dirty = false + g.writeStartArray() + while (p.nextToken() != END_ARRAY) { + // wildcards can have multiple matches, continually update the dirty count + dirty |= evaluatePath(p, g, QuotedStyle, xs) + } + g.writeEndArray() + + dirty + + case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) => + p.nextToken() + // we're going to have 1 or more results, switch to QuotedStyle + arrayIndex(p, () => evaluatePath(p, g, QuotedStyle, xs))(idx) + + case (START_ARRAY, Subscript :: Index(idx) :: xs) => + p.nextToken() + arrayIndex(p, () => evaluatePath(p, g, style, xs))(idx) + + case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name => + // exact field match + p.nextToken() + evaluatePath(p, g, style, xs) + + case (FIELD_NAME, Wildcard :: xs) => + // wildcard field match + p.nextToken() + evaluatePath(p, g, style, xs) + + case _ => + p.skipChildren() + false + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23802ba..8c0c5d5b1e31 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) - case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, d.scale)) - case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType(d.precision(), d.scale())) - case d: Decimal => Literal(d, DecimalType(d.precision, d.scale)) + case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) + case d: java.math.BigDecimal => + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) + case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala similarity index 85% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index d58c4756938c..287718fab7f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -210,58 +210,14 @@ case class IsNotNull(child: Expression) extends UnaryExpression with Predicate { } } -/** - * A predicate that is evaluated to be true if there are at least `n` null values. - */ -case class AtLeastNNulls(n: Int, children: Seq[Expression]) extends Predicate { - override def nullable: Boolean = false - override def foldable: Boolean = children.forall(_.foldable) - override def toString: String = s"AtLeastNNulls($n, ${children.mkString(",")})" - - private[this] val childrenArray = children.toArray - - override def eval(input: InternalRow): Boolean = { - var numNulls = 0 - var i = 0 - while (i < childrenArray.length && numNulls < n) { - val evalC = childrenArray(i).eval(input) - if (evalC == null) { - numNulls += 1 - } - i += 1 - } - numNulls >= n - } - - override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val numNulls = ctx.freshName("numNulls") - val code = children.map { e => - val eval = e.gen(ctx) - s""" - if ($numNulls < $n) { - ${eval.code} - if (${eval.isNull}) { - $numNulls += 1; - } - } - """ - }.mkString("\n") - s""" - int $numNulls = 0; - $code - boolean ${ev.isNull} = false; - boolean ${ev.primitive} = $numNulls >= $n; - """ - } -} /** * A predicate that is evaluated to be true if there are at least `n` non-null and non-NaN values. 
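 *
 * Illustrative sketch of these documented semantics (editorial example, not from the original
 * patch; `Literal.create` is assumed for a typed null):
 * {{{
 *   AtLeastNNonNulls(1, Seq(Literal(1), Literal.create(null, IntegerType))).eval()  // true
 *   AtLeastNNonNulls(2, Seq(Literal(1), Literal(Double.NaN))).eval()                // false
 * }}}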
 */
-case class AtLeastNNonNullNans(n: Int, children: Seq[Expression]) extends Predicate {
+case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate {
   override def nullable: Boolean = false
   override def foldable: Boolean = children.forall(_.foldable)
-  override def toString: String = s"AtLeastNNonNullNans($n, ${children.mkString(",")})"
+  override def toString: String = s"AtLeastNNonNulls($n, ${children.mkString(",")})"
   private[this] val childrenArray = children.toArray
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/RowOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala
similarity index 85%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/RowOrdering.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala
index 873f5324c573..6407c73bc97d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/RowOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.types._
 /**
  * An interpreted row ordering comparator.
  */
-class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[InternalRow] {
+class InterpretedOrdering(ordering: Seq[SortOrder]) extends Ordering[InternalRow] {
   def this(ordering: Seq[SortOrder], inputSchema: Seq[Attribute]) =
     this(ordering.map(BindReferences.bindReference(_, inputSchema)))
@@ -49,9 +49,9 @@ class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[InternalRow] {
             case dt: AtomicType if order.direction == Descending =>
               dt.ordering.asInstanceOf[Ordering[Any]].reverse.compare(left, right)
             case s: StructType if order.direction == Ascending =>
-              s.ordering.asInstanceOf[Ordering[Any]].compare(left, right)
+              s.interpretedOrdering.asInstanceOf[Ordering[Any]].compare(left, right)
             case s: StructType if order.direction == Descending =>
-              s.ordering.asInstanceOf[Ordering[Any]].reverse.compare(left, right)
+              s.interpretedOrdering.asInstanceOf[Ordering[Any]].reverse.compare(left, right)
             case other =>
               throw new IllegalArgumentException(s"Type $other does not support ordered operations")
           }
@@ -65,6 +65,18 @@ class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[InternalRow] {
   }
 }
+object InterpretedOrdering {
+
+  /**
+   * Creates an [[InterpretedOrdering]] for the given schema, in natural ascending order.
+   */
+  def forSchema(dataTypes: Seq[DataType]): InterpretedOrdering = {
+    new InterpretedOrdering(dataTypes.zipWithIndex.map {
+      case (dt, index) => new SortOrder(BoundReference(index, dt, nullable = true), Ascending)
+    })
+  }
+}
+
 object RowOrdering {
   /**
@@ -81,13 +93,4 @@ object RowOrdering {
   * Returns true iff outputs from the expressions can be ordered.
   */
  def isOrderable(exprs: Seq[Expression]): Boolean = exprs.forall(e => isOrderable(e.dataType))
-
-  /**
-   * Creates a [[RowOrdering]] for the given schema, in natural ascending order.
- */ - def forSchema(dataTypes: Seq[DataType]): RowOrdering = { - new RowOrdering(dataTypes.zipWithIndex.map { - case (dt, index) => new SortOrder(BoundReference(index, dt, nullable = true), Ascending) - }) - } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index b69bbabee7e8..65706dba7d97 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types._ @@ -97,31 +98,124 @@ case class Not(child: Expression) /** * Evaluates to `true` if `list` contains `value`. */ -case class In(value: Expression, list: Seq[Expression]) extends Predicate with CodegenFallback { +case class In(value: Expression, list: Seq[Expression]) extends Predicate + with ImplicitCastInputTypes { + + require(list != null, "list should not be null") + + override def inputTypes: Seq[AbstractDataType] = value.dataType +: list.map(_.dataType) + + override def checkInputDataTypes(): TypeCheckResult = { + if (list.exists(l => l.dataType != value.dataType)) { + TypeCheckResult.TypeCheckFailure( + "Arguments must be same type") + } else { + TypeCheckResult.TypeCheckSuccess + } + } + override def children: Seq[Expression] = value +: list - override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN. + override def nullable: Boolean = children.exists(_.nullable) + override def foldable: Boolean = children.forall(_.foldable) + override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" override def eval(input: InternalRow): Any = { val evaluatedValue = value.eval(input) - list.exists(e => e.eval(input) == evaluatedValue) + if (evaluatedValue == null) { + null + } else { + var hasNull = false + list.foreach { e => + val v = e.eval(input) + if (v == evaluatedValue) { + return true + } else if (v == null) { + hasNull = true + } + } + if (hasNull) { + null + } else { + false + } + } } -} + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val valueGen = value.gen(ctx) + val listGen = list.map(_.gen(ctx)) + val listCode = listGen.map(x => + s""" + if (!${ev.primitive}) { + ${x.code} + if (${x.isNull}) { + ${ev.isNull} = true; + } else if (${ctx.genEqual(value.dataType, valueGen.primitive, x.primitive)}) { + ${ev.isNull} = false; + ${ev.primitive} = true; + } + } + """).mkString("\n") + s""" + ${valueGen.code} + boolean ${ev.primitive} = false; + boolean ${ev.isNull} = ${valueGen.isNull}; + if (!${ev.isNull}) { + $listCode + } + """ + } +} /** * Optimized version of In clause, when all filter values of In clause are * static. 
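 *
 * A sketch of the null handling introduced here (illustrative only, not from the original patch):
 * {{{
 *   InSet(Literal(1), Set(1, 2)).eval()        // true
 *   InSet(Literal(3), Set(1, 2, null)).eval()  // null, because the set contains null
 * }}}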
*/ -case class InSet(child: Expression, hset: Set[Any]) - extends UnaryExpression with Predicate with CodegenFallback { +case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with Predicate { + + require(hset != null, "hset could not be null") - override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN. override def toString: String = s"$child INSET ${hset.mkString("(", ",", ")")}" - override def eval(input: InternalRow): Any = { - hset.contains(child.eval(input)) + @transient private[this] lazy val hasNull: Boolean = hset.contains(null) + + override def nullable: Boolean = child.nullable || hasNull + + protected override def nullSafeEval(value: Any): Any = { + if (hset.contains(value)) { + true + } else if (hasNull) { + null + } else { + false + } + } + + def getHSet(): Set[Any] = hset + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val setName = classOf[Set[Any]].getName + val InSetName = classOf[InSet].getName + val childGen = child.gen(ctx) + ctx.references += this + val hsetTerm = ctx.freshName("hset") + val hasNullTerm = ctx.freshName("hasNull") + ctx.addMutableState(setName, hsetTerm, + s"$hsetTerm = (($InSetName)expressions[${ctx.references.size - 1}]).getHSet();") + ctx.addMutableState("boolean", hasNullTerm, s"$hasNullTerm = $hsetTerm.contains(null);") + s""" + ${childGen.code} + boolean ${ev.isNull} = ${childGen.isNull}; + boolean ${ev.primitive} = false; + if (!${ev.isNull}) { + ${ev.primitive} = $hsetTerm.contains(${childGen.primitive}); + if (!${ev.primitive} && $hasNullTerm) { + ${ev.isNull} = true; + } + } + """ } } @@ -325,7 +419,7 @@ case class LessThan(left: Expression, right: Expression) extends BinaryCompariso override def symbol: String = "<" - private lazy val ordering = TypeUtils.getOrdering(left.dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lt(input1, input2) } @@ -337,7 +431,7 @@ case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryCo override def symbol: String = "<=" - private lazy val ordering = TypeUtils.getOrdering(left.dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lteq(input1, input2) } @@ -349,7 +443,7 @@ case class GreaterThan(left: Expression, right: Expression) extends BinaryCompar override def symbol: String = ">" - private lazy val ordering = TypeUtils.getOrdering(left.dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gt(input1, input2) } @@ -361,7 +455,7 @@ case class GreaterThanOrEqual(left: Expression, right: Expression) extends Binar override def symbol: String = ">=" - private lazy val ordering = TypeUtils.getOrdering(left.dataType) + private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gteq(input1, input2) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala rename to 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala new file mode 100644 index 000000000000..6dff28a7cde4 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import java.util.regex.{MatchResult, Pattern} + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + + +trait StringRegexExpression extends ImplicitCastInputTypes { + self: BinaryExpression => + + def escape(v: String): String + def matches(regex: Pattern, str: String): Boolean + + override def dataType: DataType = BooleanType + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + + // try cache the pattern for Literal + private lazy val cache: Pattern = right match { + case x @ Literal(value: String, StringType) => compile(value) + case _ => null + } + + protected def compile(str: String): Pattern = if (str == null) { + null + } else { + // Let it raise exception if couldn't compile the regex string + Pattern.compile(escape(str)) + } + + protected def pattern(str: String) = if (cache == null) compile(str) else cache + + protected override def nullSafeEval(input1: Any, input2: Any): Any = { + val regex = pattern(input2.asInstanceOf[UTF8String].toString) + if(regex == null) { + null + } else { + matches(regex, input1.asInstanceOf[UTF8String].toString) + } + } +} + + +/** + * Simple RegEx pattern matching function + */ +case class Like(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression with CodegenFallback { + + override def escape(v: String): String = StringUtils.escapeLikeRegex(v) + + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() + + override def toString: String = s"$left LIKE $right" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val patternClass = classOf[Pattern].getName + val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" + val pattern = ctx.freshName("pattern") + + if (right.foldable) { + val rVal = right.eval() + if (rVal != null) { + val regexStr = + StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) + ctx.addMutableState(patternClass, pattern, + 
s"""$pattern = ${patternClass}.compile("$regexStr");""") + + // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. + val eval = left.gen(ctx) + s""" + ${eval.code} + boolean ${ev.isNull} = ${eval.isNull}; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) { + ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).matches(); + } + """ + } else { + s""" + boolean ${ev.isNull} = true; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + """ + } + } else { + nullSafeCodeGen(ctx, ev, (eval1, eval2) => { + s""" + String rightStr = ${eval2}.toString(); + ${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr)); + ${ev.primitive} = $pattern.matcher(${eval1}.toString()).matches(); + """ + }) + } + } +} + + +case class RLike(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression with CodegenFallback { + + override def escape(v: String): String = v + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) + override def toString: String = s"$left RLIKE $right" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val patternClass = classOf[Pattern].getName + val pattern = ctx.freshName("pattern") + + if (right.foldable) { + val rVal = right.eval() + if (rVal != null) { + val regexStr = + StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) + ctx.addMutableState(patternClass, pattern, + s"""$pattern = ${patternClass}.compile("$regexStr");""") + + // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. + val eval = left.gen(ctx) + s""" + ${eval.code} + boolean ${ev.isNull} = ${eval.isNull}; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) { + ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).find(0); + } + """ + } else { + s""" + boolean ${ev.isNull} = true; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + """ + } + } else { + nullSafeCodeGen(ctx, ev, (eval1, eval2) => { + s""" + String rightStr = ${eval2}.toString(); + ${patternClass} $pattern = ${patternClass}.compile(rightStr); + ${ev.primitive} = $pattern.matcher(${eval1}.toString()).find(0); + """ + }) + } + } +} + + +/** + * Splits str around pat (pattern is a regular expression). + */ +case class StringSplit(str: Expression, pattern: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + override def left: Expression = str + override def right: Expression = pattern + override def dataType: DataType = ArrayType(StringType) + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + + override def nullSafeEval(string: Any, regex: Any): Any = { + val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) + new GenericArrayData(strings.asInstanceOf[Array[Any]]) + } + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val arrayClass = classOf[GenericArrayData].getName + nullSafeCodeGen(ctx, ev, (str, pattern) => + // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. + s"""${ev.primitive} = new $arrayClass($str.split($pattern, -1));""") + } + + override def prettyName: String = "split" +} + + +/** + * Replace all substrings of str that match regexp with rep. + * + * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. 
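+ *
+ * A rough usage sketch (editorial illustration, not from the original patch; assumes foldable
+ * [[Literal]] inputs):
+ * {{{
+ *   RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num")).eval()
+ *   // returns UTF8String "num-num"
+ * }}}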
+ */ +case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + + // last regex in string, we will update the pattern iff regexp value changed. + @transient private var lastRegex: UTF8String = _ + // last regex pattern, we cache it for performance concern + @transient private var pattern: Pattern = _ + // last replacement string, we don't want to convert a UTF8String => java.langString every time. + @transient private var lastReplacement: String = _ + @transient private var lastReplacementInUTF8: UTF8String = _ + // result buffer write by Matcher + @transient private val result: StringBuffer = new StringBuffer + + override def nullSafeEval(s: Any, p: Any, r: Any): Any = { + if (!p.equals(lastRegex)) { + // regex value changed + lastRegex = p.asInstanceOf[UTF8String].clone() + pattern = Pattern.compile(lastRegex.toString) + } + if (!r.equals(lastReplacementInUTF8)) { + // replacement string changed + lastReplacementInUTF8 = r.asInstanceOf[UTF8String].clone() + lastReplacement = lastReplacementInUTF8.toString + } + val m = pattern.matcher(s.toString()) + result.delete(0, result.length()) + + while (m.find) { + m.appendReplacement(result, lastReplacement) + } + m.appendTail(result) + + UTF8String.fromString(result.toString) + } + + override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) + override def children: Seq[Expression] = subject :: regexp :: rep :: Nil + override def prettyName: String = "regexp_replace" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val termLastRegex = ctx.freshName("lastRegex") + val termPattern = ctx.freshName("pattern") + + val termLastReplacement = ctx.freshName("lastReplacement") + val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") + + val termResult = ctx.freshName("result") + + val classNamePattern = classOf[Pattern].getCanonicalName + val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName + + ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") + ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") + ctx.addMutableState("String", termLastReplacement, s"${termLastReplacement} = null;") + ctx.addMutableState("UTF8String", + termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") + ctx.addMutableState(classNameStringBuffer, + termResult, s"${termResult} = new $classNameStringBuffer();") + + nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => { + s""" + if (!$regexp.equals(${termLastRegex})) { + // regex value changed + ${termLastRegex} = $regexp.clone(); + ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + } + if (!$rep.equals(${termLastReplacementInUTF8})) { + // replacement string changed + ${termLastReplacementInUTF8} = $rep.clone(); + ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); + } + ${termResult}.delete(0, ${termResult}.length()); + java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString()); + + while (m.find()) { + m.appendReplacement(${termResult}, ${termLastReplacement}); + } + m.appendTail(${termResult}); + ${ev.primitive} = UTF8String.fromString(${termResult}.toString()); + ${ev.isNull} = false; + """ + }) + } +} + +/** + * Extract a specific(idx) group identified by a Java regex. + * + * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. 
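+ *
+ * A rough usage sketch (illustrative only, not from the original patch):
+ * {{{
+ *   RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)).eval()
+ *   // returns UTF8String "100"
+ * }}}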
+ */ +case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + def this(s: Expression, r: Expression) = this(s, r, Literal(1)) + + // last regex in string, we will update the pattern iff regexp value changed. + @transient private var lastRegex: UTF8String = _ + // last regex pattern, we cache it for performance concern + @transient private var pattern: Pattern = _ + + override def nullSafeEval(s: Any, p: Any, r: Any): Any = { + if (!p.equals(lastRegex)) { + // regex value changed + lastRegex = p.asInstanceOf[UTF8String].clone() + pattern = Pattern.compile(lastRegex.toString) + } + val m = pattern.matcher(s.toString) + if (m.find) { + val mr: MatchResult = m.toMatchResult + UTF8String.fromString(mr.group(r.asInstanceOf[Int])) + } else { + UTF8String.EMPTY_UTF8 + } + } + + override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) + override def children: Seq[Expression] = subject :: regexp :: idx :: Nil + override def prettyName: String = "regexp_extract" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val termLastRegex = ctx.freshName("lastRegex") + val termPattern = ctx.freshName("pattern") + val classNamePattern = classOf[Pattern].getCanonicalName + + ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") + ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") + + nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { + s""" + if (!$regexp.equals(${termLastRegex})) { + // regex value changed + ${termLastRegex} = $regexp.clone(); + ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + } + java.util.regex.Matcher m = + ${termPattern}.matcher($subject.toString()); + if (m.find()) { + java.util.regex.MatchResult mr = m.toMatchResult(); + ${ev.primitive} = UTF8String.fromString(mr.group($idx)); + ${ev.isNull} = false; + } else { + ${ev.primitive} = UTF8String.EMPTY_UTF8; + ${ev.isNull} = false; + }""" + }) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala index d04434b953e4..017efd2a166a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala @@ -19,8 +19,142 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{Decimal, DataType, StructType, AtomicType} -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} + +/** + * An extended version of [[InternalRow]] that implements all special getters, toString + * and equals/hashCode by `genericGet`. 
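+ *
+ * For example (editorial sketch, not from the original patch), a [[GenericInternalRow]] backed
+ * by an Array[Any] serves the typed getters through `genericGet`:
+ * {{{
+ *   val row = new GenericInternalRow(Array[Any](1, 2L))
+ *   row.getInt(0)   // 1
+ *   row.getLong(1)  // 2L
+ * }}}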
+ */ +trait BaseGenericInternalRow extends InternalRow { + + protected def genericGet(ordinal: Int): Any + + // default implementation (slow) + private def getAs[T](ordinal: Int) = genericGet(ordinal).asInstanceOf[T] + override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null + override def get(ordinal: Int, dataType: DataType): AnyRef = getAs(ordinal) + override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) + override def getByte(ordinal: Int): Byte = getAs(ordinal) + override def getShort(ordinal: Int): Short = getAs(ordinal) + override def getInt(ordinal: Int): Int = getAs(ordinal) + override def getLong(ordinal: Int): Long = getAs(ordinal) + override def getFloat(ordinal: Int): Float = getAs(ordinal) + override def getDouble(ordinal: Int): Double = getAs(ordinal) + override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal) + override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal) + override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal) + override def getArray(ordinal: Int): ArrayData = getAs(ordinal) + override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal) + override def getMap(ordinal: Int): MapData = getAs(ordinal) + override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal) + + override def anyNull: Boolean = { + val len = numFields + var i = 0 + while (i < len) { + if (isNullAt(i)) { return true } + i += 1 + } + false + } + + override def toString: String = { + if (numFields == 0) { + "[empty row]" + } else { + val sb = new StringBuilder + sb.append("[") + sb.append(genericGet(0)) + val len = numFields + var i = 1 + while (i < len) { + sb.append(",") + sb.append(genericGet(i)) + i += 1 + } + sb.append("]") + sb.toString() + } + } + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[BaseGenericInternalRow]) { + return false + } + + val other = o.asInstanceOf[BaseGenericInternalRow] + if (other eq null) { + return false + } + + val len = numFields + if (len != other.numFields) { + return false + } + + var i = 0 + while (i < len) { + if (isNullAt(i) != other.isNullAt(i)) { + return false + } + if (!isNullAt(i)) { + val o1 = genericGet(i) + val o2 = other.genericGet(i) + o1 match { + case b1: Array[Byte] => + if (!o2.isInstanceOf[Array[Byte]] || + !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) { + return false + } + case f1: Float if java.lang.Float.isNaN(f1) => + if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) { + return false + } + case d1: Double if java.lang.Double.isNaN(d1) => + if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) { + return false + } + case _ => if (o1 != o2) { + return false + } + } + } + i += 1 + } + true + } + + // Custom hashCode function that matches the efficient code generated version. 
+ override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numFields + while (i < len) { + val update: Int = + if (isNullAt(i)) { + 0 + } else { + genericGet(i) match { + case b: Boolean => if (b) 0 else 1 + case b: Byte => b.toInt + case s: Short => s.toInt + case i: Int => i + case l: Long => (l ^ (l >>> 32)).toInt + case f: Float => java.lang.Float.floatToIntBits(f) + case d: Double => + val b = java.lang.Double.doubleToLongBits(d) + (b ^ (b >>> 32)).toInt + case a: Array[Byte] => java.util.Arrays.hashCode(a) + case other => other.hashCode() + } + } + result = 37 * result + update + i += 1 + } + result + } +} /** * An extended interface to [[InternalRow]] that allows the values for each column to be updated. @@ -39,6 +173,13 @@ abstract class MutableRow extends InternalRow { def setLong(i: Int, value: Long): Unit = { update(i, value) } def setFloat(i: Int, value: Float): Unit = { update(i, value) } def setDouble(i: Int, value: Double): Unit = { update(i, value) } + + /** + * Update the decimal column at `i`. + * + * Note: In order to support update decimal with precision > 18 in UnsafeRow, + * CAN NOT call setNullAt() for decimal column on UnsafeRow, call setDecimal(i, null, precision). + */ def setDecimal(i: Int, value: Decimal, precision: Int) { update(i, value) } } @@ -76,15 +217,15 @@ class GenericRowWithSchema(values: Array[Any], override val schema: StructType) * Note that, while the array is not copied, and thus could technically be mutated after creation, * this is not allowed. */ -class GenericInternalRow(protected[sql] val values: Array[Any]) extends InternalRow { +class GenericInternalRow(private[sql] val values: Array[Any]) extends BaseGenericInternalRow { /** No-arg constructor for serialization. */ protected def this() = this(null) def this(size: Int) = this(new Array[Any](size)) - override def genericGet(ordinal: Int): Any = values(ordinal) + override protected def genericGet(ordinal: Int) = values(ordinal) - override def toSeq: Seq[Any] = values + override def toSeq(fieldTypes: Seq[DataType]): Seq[Any] = values override def numFields: Int = values.length @@ -103,15 +244,15 @@ class GenericInternalRowWithSchema(values: Array[Any], val schema: StructType) def fieldIndex(name: String): Int = schema.fieldIndex(name) } -class GenericMutableRow(val values: Array[Any]) extends MutableRow { +class GenericMutableRow(values: Array[Any]) extends MutableRow with BaseGenericInternalRow { /** No-arg constructor for serialization. 
*/ protected def this() = this(null) def this(size: Int) = this(new Array[Any](size)) - override def genericGet(ordinal: Int): Any = values(ordinal) + override protected def genericGet(ordinal: Int) = values(ordinal) - override def toSeq: Seq[Any] = values + override def toSeq(fieldTypes: Seq[DataType]): Seq[Any] = values override def numFields: Int = values.length diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala similarity index 72% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 56225290cd6b..3c23f2ecfb57 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -19,14 +19,11 @@ package org.apache.spark.sql.catalyst.expressions import java.text.DecimalFormat import java.util.Arrays +import java.util.{Map => JMap, HashMap} import java.util.Locale -import java.util.regex.{MatchResult, Pattern} - -import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -123,143 +120,6 @@ case class ConcatWs(children: Seq[Expression]) } } - -trait StringRegexExpression extends ImplicitCastInputTypes { - self: BinaryExpression => - - def escape(v: String): String - def matches(regex: Pattern, str: String): Boolean - - override def dataType: DataType = BooleanType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - - // try cache the pattern for Literal - private lazy val cache: Pattern = right match { - case x @ Literal(value: String, StringType) => compile(value) - case _ => null - } - - protected def compile(str: String): Pattern = if (str == null) { - null - } else { - // Let it raise exception if couldn't compile the regex string - Pattern.compile(escape(str)) - } - - protected def pattern(str: String) = if (cache == null) compile(str) else cache - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val regex = pattern(input2.asInstanceOf[UTF8String].toString()) - if(regex == null) { - null - } else { - matches(regex, input1.asInstanceOf[UTF8String].toString()) - } - } -} - -/** - * Simple RegEx pattern matching function - */ -case class Like(left: Expression, right: Expression) - extends BinaryExpression with StringRegexExpression with CodegenFallback { - - override def escape(v: String): String = StringUtils.escapeLikeRegex(v) - - override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() - - override def toString: String = s"$left LIKE $right" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val patternClass = classOf[Pattern].getName - val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" - val pattern = ctx.freshName("pattern") - - if (right.foldable) { - val rVal = right.eval() - if (rVal != null) { - val regexStr = - StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) - ctx.addMutableState(patternClass, pattern, - 
s"""$pattern = ${patternClass}.compile("$regexStr");""") - - // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. - val eval = left.gen(ctx) - s""" - ${eval.code} - boolean ${ev.isNull} = ${eval.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).matches(); - } - """ - } else { - s""" - boolean ${ev.isNull} = true; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - """ - } - } else { - nullSafeCodeGen(ctx, ev, (eval1, eval2) => { - s""" - String rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr)); - ${ev.primitive} = $pattern.matcher(${eval1}.toString()).matches(); - """ - }) - } - } -} - - -case class RLike(left: Expression, right: Expression) - extends BinaryExpression with StringRegexExpression with CodegenFallback { - - override def escape(v: String): String = v - override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) - override def toString: String = s"$left RLIKE $right" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val patternClass = classOf[Pattern].getName - val pattern = ctx.freshName("pattern") - - if (right.foldable) { - val rVal = right.eval() - if (rVal != null) { - val regexStr = - StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) - ctx.addMutableState(patternClass, pattern, - s"""$pattern = ${patternClass}.compile("$regexStr");""") - - // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. - val eval = left.gen(ctx) - s""" - ${eval.code} - boolean ${ev.isNull} = ${eval.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).find(0); - } - """ - } else { - s""" - boolean ${ev.isNull} = true; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - """ - } - } else { - nullSafeCodeGen(ctx, ev, (eval1, eval2) => { - s""" - String rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile(rightStr); - ${ev.primitive} = $pattern.matcher(${eval1}.toString()).find(0); - """ - }) - } - } -} - - trait String2StringExpression extends ImplicitCastInputTypes { self: UnaryExpression => @@ -304,7 +164,7 @@ case class Lower(child: Expression) extends UnaryExpression with String2StringEx } /** A base trait for functions that compare two strings, returning a boolean. */ -trait StringComparison extends ImplicitCastInputTypes { +trait StringPredicate extends Predicate with ImplicitCastInputTypes { self: BinaryExpression => def compare(l: UTF8String, r: UTF8String): Boolean @@ -321,7 +181,7 @@ trait StringComparison extends ImplicitCastInputTypes { * A function that returns true if the string `left` contains the string `right`. 
*/ case class Contains(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.contains(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).contains($c2)") @@ -332,7 +192,7 @@ case class Contains(left: Expression, right: Expression) * A function that returns true if the string `left` starts with the string `right`. */ case class StartsWith(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.startsWith(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).startsWith($c2)") @@ -343,13 +203,110 @@ case class StartsWith(left: Expression, right: Expression) * A function that returns true if the string `left` ends with the string `right`. */ case class EndsWith(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.endsWith(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).endsWith($c2)") } } +object StringTranslate { + + def buildDict(matchingString: UTF8String, replaceString: UTF8String) + : JMap[Character, Character] = { + val matching = matchingString.toString() + val replace = replaceString.toString() + val dict = new HashMap[Character, Character]() + var i = 0 + while (i < matching.length()) { + val rep = if (i < replace.length()) replace.charAt(i) else '\0' + if (null == dict.get(matching.charAt(i))) { + dict.put(matching.charAt(i), rep) + } + i += 1 + } + dict + } +} + +/** + * A function translate any character in the `srcExpr` by a character in `replaceExpr`. + * The characters in `replaceExpr` is corresponding to the characters in `matchingExpr`. + * The translate will happen when any character in the string matching with the character + * in the `matchingExpr`. 
+ */ +case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replaceExpr: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + + @transient private var lastMatching: UTF8String = _ + @transient private var lastReplace: UTF8String = _ + @transient private var dict: JMap[Character, Character] = _ + + override def nullSafeEval(srcEval: Any, matchingEval: Any, replaceEval: Any): Any = { + if (matchingEval != lastMatching || replaceEval != lastReplace) { + lastMatching = matchingEval.asInstanceOf[UTF8String].clone() + lastReplace = replaceEval.asInstanceOf[UTF8String].clone() + dict = StringTranslate.buildDict(lastMatching, lastReplace) + } + srcEval.asInstanceOf[UTF8String].translate(dict) + } + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val termLastMatching = ctx.freshName("lastMatching") + val termLastReplace = ctx.freshName("lastReplace") + val termDict = ctx.freshName("dict") + val classNameDict = classOf[JMap[Character, Character]].getCanonicalName + + ctx.addMutableState("UTF8String", termLastMatching, s"${termLastMatching} = null;") + ctx.addMutableState("UTF8String", termLastReplace, s"${termLastReplace} = null;") + ctx.addMutableState(classNameDict, termDict, s"${termDict} = null;") + + nullSafeCodeGen(ctx, ev, (src, matching, replace) => { + val check = if (matchingExpr.foldable && replaceExpr.foldable) { + s"${termDict} == null" + } else { + s"!${matching}.equals(${termLastMatching}) || !${replace}.equals(${termLastReplace})" + } + s"""if ($check) { + // Not all of them is literal or matching or replace value changed + ${termLastMatching} = ${matching}.clone(); + ${termLastReplace} = ${replace}.clone(); + ${termDict} = org.apache.spark.sql.catalyst.expressions.StringTranslate + .buildDict(${termLastMatching}, ${termLastReplace}); + } + ${ev.primitive} = ${src}.translate(${termDict}); + """ + }) + } + + override def dataType: DataType = StringType + override def inputTypes: Seq[DataType] = Seq(StringType, StringType, StringType) + override def children: Seq[Expression] = srcExpr :: matchingExpr :: replaceExpr :: Nil + override def prettyName: String = "translate" +} + +/** + * A function that returns the index (1-based) of the given string (left) in the comma- + * delimited list (right). Returns 0, if the string wasn't found or if the given + * string (left) contains a comma. + */ +case class FindInSet(left: Expression, right: Expression) extends BinaryExpression + with ImplicitCastInputTypes { + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + + override protected def nullSafeEval(word: Any, set: Any): Any = + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + nullSafeCodeGen(ctx, ev, (word, set) => + s"${ev.primitive} = $set.findInSet($word);" + ) + } + + override def dataType: DataType = IntegerType +} + /** * A function that trim the spaces from both ends for the specified string. */ @@ -671,32 +628,6 @@ case class StringSpace(child: Expression) override def prettyName: String = "space" } -/** - * Splits str around pat (pattern is a regular expression). 
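StringTranslate.buildDict keeps only the first mapping for each matching character and maps unpaired characters to the NUL character, which translate treats as a deletion; FindInSet returns a 1-based position, or 0 when the word is absent or itself contains a comma. A plain-String sketch of both behaviours (the real implementations live on UTF8String):

object TranslateSketch {
  // First occurrence wins, as in StringTranslate.buildDict above.
  def buildDict(matching: String, replace: String): Map[Char, Char] =
    matching.zipWithIndex.foldLeft(Map.empty[Char, Char]) { case (dict, (c, i)) =>
      if (dict.contains(c)) dict
      else dict + (c -> (if (i < replace.length) replace.charAt(i) else '\u0000'))
    }

  def translate(src: String, dict: Map[Char, Char]): String =
    src.flatMap { c =>
      dict.get(c) match {
        case Some('\u0000') => ""          // matched but no replacement: drop the character
        case Some(r)        => r.toString
        case None           => c.toString
      }
    }

  // 1-based position of `word` in a comma-delimited `set`; 0 if absent or word has a comma.
  def findInSet(word: String, set: String): Int =
    if (word.contains(",")) 0 else set.split(",", -1).indexOf(word) + 1

  def main(args: Array[String]): Unit = {
    println(translate("translate", buildDict("rnlt", "123")))  // 1a2s3ae
    println(findInSet("def", "abc,def,ghi"))                   // 2
  }
}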
- */ -case class StringSplit(str: Expression, pattern: Expression) - extends BinaryExpression with ImplicitCastInputTypes { - - override def left: Expression = str - override def right: Expression = pattern - override def dataType: DataType = ArrayType(StringType) - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - - override def nullSafeEval(string: Any, regex: Any): Any = { - val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - new GenericArrayData(strings.asInstanceOf[Array[Any]]) - } - - override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val arrayClass = classOf[GenericArrayData].getName - nullSafeCodeGen(ctx, ev, (str, pattern) => - // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.primitive} = new $arrayClass($str.split($pattern, -1));""") - } - - override def prettyName: String = "split" -} - object Substring { def subStringBinarySQL(bytes: Array[Byte], pos: Int, len: Int): Array[Byte] = { if (pos > bytes.length) { @@ -915,7 +846,7 @@ case class Decode(bin: Expression, charset: Expression) try { ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString())); } catch (java.io.UnsupportedEncodingException e) { - org.apache.spark.unsafe.PlatformDependent.throwException(e); + org.apache.spark.unsafe.Platform.throwException(e); } """) } @@ -945,168 +876,11 @@ case class Encode(value: Expression, charset: Expression) try { ${ev.primitive} = $string.toString().getBytes($charset.toString()); } catch (java.io.UnsupportedEncodingException e) { - org.apache.spark.unsafe.PlatformDependent.throwException(e); + org.apache.spark.unsafe.Platform.throwException(e); }""") } } -/** - * Replace all substrings of str that match regexp with rep. - * - * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. - */ -case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression) - extends TernaryExpression with ImplicitCastInputTypes { - - // last regex in string, we will update the pattern iff regexp value changed. - @transient private var lastRegex: UTF8String = _ - // last regex pattern, we cache it for performance concern - @transient private var pattern: Pattern = _ - // last replacement string, we don't want to convert a UTF8String => java.langString every time. 
- @transient private var lastReplacement: String = _ - @transient private var lastReplacementInUTF8: UTF8String = _ - // result buffer write by Matcher - @transient private val result: StringBuffer = new StringBuffer - - override def nullSafeEval(s: Any, p: Any, r: Any): Any = { - if (!p.equals(lastRegex)) { - // regex value changed - lastRegex = p.asInstanceOf[UTF8String].clone() - pattern = Pattern.compile(lastRegex.toString) - } - if (!r.equals(lastReplacementInUTF8)) { - // replacement string changed - lastReplacementInUTF8 = r.asInstanceOf[UTF8String].clone() - lastReplacement = lastReplacementInUTF8.toString - } - val m = pattern.matcher(s.toString()) - result.delete(0, result.length()) - - while (m.find) { - m.appendReplacement(result, lastReplacement) - } - m.appendTail(result) - - UTF8String.fromString(result.toString) - } - - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) - override def children: Seq[Expression] = subject :: regexp :: rep :: Nil - override def prettyName: String = "regexp_replace" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") - - val termLastReplacement = ctx.freshName("lastReplacement") - val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") - - val termResult = ctx.freshName("result") - - val classNamePattern = classOf[Pattern].getCanonicalName - val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName - - ctx.addMutableState("UTF8String", - termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, - termPattern, s"${termPattern} = null;") - ctx.addMutableState("String", - termLastReplacement, s"${termLastReplacement} = null;") - ctx.addMutableState("UTF8String", - termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") - ctx.addMutableState(classNameStringBuffer, - termResult, s"${termResult} = new $classNameStringBuffer();") - - nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => { - s""" - if (!$regexp.equals(${termLastRegex})) { - // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); - } - if (!$rep.equals(${termLastReplacementInUTF8})) { - // replacement string changed - ${termLastReplacementInUTF8} = $rep.clone(); - ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); - } - ${termResult}.delete(0, ${termResult}.length()); - java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString()); - - while (m.find()) { - m.appendReplacement(${termResult}, ${termLastReplacement}); - } - m.appendTail(${termResult}); - ${ev.primitive} = UTF8String.fromString(${termResult}.toString()); - ${ev.isNull} = false; - """ - }) - } -} - -/** - * Extract a specific(idx) group identified by a Java regex. - * - * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. - */ -case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression) - extends TernaryExpression with ImplicitCastInputTypes { - def this(s: Expression, r: Expression) = this(s, r, Literal(1)) - - // last regex in string, we will update the pattern iff regexp value changed. 
- @transient private var lastRegex: UTF8String = _ - // last regex pattern, we cache it for performance concern - @transient private var pattern: Pattern = _ - - override def nullSafeEval(s: Any, p: Any, r: Any): Any = { - if (!p.equals(lastRegex)) { - // regex value changed - lastRegex = p.asInstanceOf[UTF8String].clone() - pattern = Pattern.compile(lastRegex.toString) - } - val m = pattern.matcher(s.toString()) - if (m.find) { - val mr: MatchResult = m.toMatchResult - UTF8String.fromString(mr.group(r.asInstanceOf[Int])) - } else { - UTF8String.EMPTY_UTF8 - } - } - - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) - override def children: Seq[Expression] = subject :: regexp :: idx :: Nil - override def prettyName: String = "regexp_extract" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") - val classNamePattern = classOf[Pattern].getCanonicalName - - ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") - - nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { - s""" - if (!$regexp.equals(${termLastRegex})) { - // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); - } - java.util.regex.Matcher m = - ${termPattern}.matcher($subject.toString()); - if (m.find()) { - java.util.regex.MatchResult mr = m.toMatchResult(); - ${ev.primitive} = UTF8String.fromString(mr.group($idx)); - ${ev.isNull} = false; - } else { - ${ev.primitive} = UTF8String.EMPTY_UTF8; - ${ev.isNull} = false; - }""" - }) - } -} - /** * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, * and returns the result as a string. If D is 0, the result has no decimal point or @@ -1208,8 +982,8 @@ case class FormatNumber(x: Expression, d: Expression) $df $dFormat = new $df($pattern.toString()); $lastDValue = $d; $numberFormat.applyPattern($dFormat.toPattern()); - ${ev.primitive} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); } + ${ev.primitive} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); } else { ${ev.primitive} = null; ${ev.isNull} = true; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index e4b6294dc7b8..a430000bef65 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.immutable.HashSet -import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries +import org.apache.spark.sql.catalyst.analysis.{CleanupAliases, EliminateSubQueries} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.FullOuter @@ -31,14 +31,8 @@ import org.apache.spark.sql.types._ abstract class Optimizer extends RuleExecutor[LogicalPlan] -class DefaultOptimizer extends Optimizer { - - /** - * Override to provide additional rules for the "Operator Optimizations" batch. 
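The one-line move in FormatNumber above fixes a caching bug: the generated code should rebuild the DecimalFormat pattern only when the requested decimal count changes, but the format call itself must run for every row, so it has to sit outside the `if ($lastDValue != $d)` block. A plain-Scala sketch of the corrected shape (not the generated Java):

import java.text.DecimalFormat

class FormatNumberSketch {
  private var lastD: Int = -100
  private val fmt = new DecimalFormat("")

  def format(x: Double, d: Int): String = {
    if (lastD != d) {                        // rebuild the pattern only when d changes
      val pattern = new StringBuilder("#,###,###,###,###,###,##0")
      if (d > 0) pattern.append(".").append("0" * d)
      fmt.applyPattern(pattern.toString)
      lastD = d
    }
    fmt.format(x)                            // must run unconditionally (this was the bug)
  }
}

object FormatNumberDemo {
  def main(args: Array[String]): Unit = {
    val f = new FormatNumberSketch
    println(f.format(12345.678, 2))  // 12,345.68
    println(f.format(1.5, 2))        // 1.50 -- a second row with the same d is still formatted
  }
}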
- */ - val extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = Nil - - lazy val batches = +object DefaultOptimizer extends Optimizer { + val batches = // SubQueries are only needed for analysis and can be removed before execution. Batch("Remove SubQueries", FixedPoint(100), EliminateSubQueries) :: @@ -47,27 +41,26 @@ class DefaultOptimizer extends Optimizer { RemoveLiteralFromGroupExpressions) :: Batch("Operator Optimizations", FixedPoint(100), // Operator push down - SetOperationPushDown :: - SamplePushDown :: - PushPredicateThroughJoin :: - PushPredicateThroughProject :: - PushPredicateThroughGenerate :: - ColumnPruning :: + SetOperationPushDown, + SamplePushDown, + PushPredicateThroughJoin, + PushPredicateThroughProject, + PushPredicateThroughGenerate, + ColumnPruning, // Operator combine - ProjectCollapsing :: - CombineFilters :: - CombineLimits :: + ProjectCollapsing, + CombineFilters, + CombineLimits, // Constant folding - NullPropagation :: - OptimizeIn :: - ConstantFolding :: - LikeSimplification :: - BooleanSimplification :: - RemovePositive :: - SimplifyFilters :: - SimplifyCasts :: - SimplifyCaseConversionExpressions :: - extendedOperatorOptimizationRules.toList : _*) :: + NullPropagation, + OptimizeIn, + ConstantFolding, + LikeSimplification, + BooleanSimplification, + RemovePositive, + SimplifyFilters, + SimplifyCasts, + SimplifyCaseConversionExpressions) :: Batch("Decimal Optimizations", FixedPoint(100), DecimalAggregates) :: Batch("LocalRelation", FixedPoint(100), @@ -172,6 +165,7 @@ object SetOperationPushDown extends Rule[LogicalPlan] { * * - Inserting Projections beneath the following operators: * - Aggregate + * - Generate * - Project <- Join * - LeftSemiJoin */ @@ -185,6 +179,21 @@ object ColumnPruning extends Rule[LogicalPlan] { case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) + // Eliminate attributes that are not needed to calculate the Generate. + case g: Generate if !g.join && (g.child.outputSet -- g.references).nonEmpty => + g.copy(child = Project(g.references.toSeq, g.child)) + + case p @ Project(_, g: Generate) if g.join && p.references.subsetOf(g.generatedSet) => + p.copy(child = g.copy(join = false)) + + case p @ Project(projectList, g: Generate) if g.join => + val neededChildOutput = p.references -- g.generatorOutput ++ g.references + if (neededChildOutput == g.child.outputSet) { + p + } else { + Project(projectList, g.copy(child = Project(neededChildOutput.toSeq, g.child))) + } + case p @ Project(projectList, a @ Aggregate(groupingExpressions, aggregateExpressions, child)) if (a.outputSet -- p.references).nonEmpty => Project( @@ -229,18 +238,12 @@ object ColumnPruning extends Rule[LogicalPlan] { } /** Applies a projection only when the child is producing unnecessary attributes */ - private def prunedChild(c: LogicalPlan, allReferences: AttributeSet) = { + private def prunedChild(c: LogicalPlan, allReferences: AttributeSet) = if ((c.outputSet -- allReferences.filter(c.outputSet.contains)).nonEmpty) { - // We need to preserve the nullability of c's output. - // So, we first create a outputMap and if a reference is from the output of - // c, we use that output attribute from c. 
- val outputMap = AttributeMap(c.output.map(attr => (attr, attr))) - val projectList = allReferences.filter(outputMap.contains).map(outputMap).toSeq - Project(projectList, c) + Project(allReferences.filter(c.outputSet.contains).toSeq, c) } else { c } - } } /** @@ -273,8 +276,11 @@ object ProjectCollapsing extends Rule[LogicalPlan] { val substitutedProjection = projectList1.map(_.transform { case a: Attribute => aliasMap.getOrElse(a, a) }).asInstanceOf[Seq[NamedExpression]] - - Project(substitutedProjection, child) + // collapse 2 projects may introduce unnecessary Aliases, trim them here. + val cleanedProjection = substitutedProjection.map(p => + CleanupAliases.trimNonTopLevelAliases(p).asInstanceOf[NamedExpression] + ) + Project(cleanedProjection, child) } } } @@ -366,7 +372,7 @@ object NullPropagation extends Rule[LogicalPlan] { case _ => e } - case e: StringComparison => e.children match { + case e: StringPredicate => e.children match { case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType) case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType) case _ => e @@ -389,12 +395,6 @@ object ConstantFolding extends Rule[LogicalPlan] { // Fold expressions that are foldable. case e if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType) - - // Fold "literal in (item1, item2, ..., literal, ...)" into true directly. - case In(Literal(v, _), list) if list.exists { - case Literal(candidate, _) if candidate == v => true - case _ => false - } => Literal.create(true, BooleanType) } } } @@ -406,7 +406,7 @@ object ConstantFolding extends Rule[LogicalPlan] { object OptimizeIn extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsDown { - case In(v, list) if !list.exists(!_.isInstanceOf[Literal]) => + case In(v, list) if !list.exists(!_.isInstanceOf[Literal]) && list.size > 10 => val hSet = list.map(e => e.eval(EmptyRow)) InSet(v, HashSet() ++ hSet) } @@ -530,13 +530,6 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { */ object CombineFilters extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case Filter(Not(AtLeastNNulls(1, e1)), Filter(Not(AtLeastNNulls(1, e2)), grandChild)) => - // If we are combining two expressions Not(AtLeastNNulls(1, e1)) and - // Not(AtLeastNNulls(1, e2)) - // (this is used to make sure there is no null in the result of e1 and e2 and - // they are added by FilterNullsInJoinKey optimziation rule), we can - // just create a Not(AtLeastNNulls(1, (e1 ++ e2).distinct)). 
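OptimizeIn now rewrites In to InSet only when the literal list holds more than 10 elements, since building a HashSet buys little over a short linear scan. A toy sketch of that size-gated rewrite (Int-only stand-ins, not Catalyst expressions):

object InVsInSetSketch {
  sealed trait Pred { def eval(v: Int): Boolean }
  final case class In(values: Seq[Int]) extends Pred {
    def eval(v: Int): Boolean = values.contains(v)   // O(n) scan of the literal list
  }
  final case class InSet(values: Set[Int]) extends Pred {
    def eval(v: Int): Boolean = values.contains(v)   // expected O(1) hash lookup
  }

  // Mirrors the `list.size > 10` guard added to OptimizeIn above.
  def optimize(p: Pred): Pred = p match {
    case In(vs) if vs.size > 10 => InSet(vs.toSet)
    case other                  => other
  }

  def main(args: Array[String]): Unit = {
    println(optimize(In(1 to 3)))    // stays In: too small to be worth a set
    println(optimize(In(1 to 20)))   // rewritten to InSet
  }
}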
- Filter(Not(AtLeastNNulls(1, (e1 ++ e2).distinct)), grandChild) case ff @ Filter(fc, nf @ Filter(nc, grandChild)) => Filter(And(nc, fc), grandChild) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index c610f70d3843..55286f9f2fc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, VirtualColumn} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.types.{DataType, StructType} @@ -92,7 +93,7 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy val newArgs = productIterator.map(recursiveTransform).toArray - if (changed) makeCopy(newArgs) else this + if (changed) makeCopy(newArgs).asInstanceOf[this.type] else this } /** @@ -124,7 +125,7 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy val newArgs = productIterator.map(recursiveTransform).toArray - if (changed) makeCopy(newArgs) else this + if (changed) makeCopy(newArgs).asInstanceOf[this.type] else this } /** Returns the result of running [[transformExpressions]] on this node diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index bedeaf06adf1..9bb466ac2d29 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -22,11 +22,60 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, TreeNode} abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { + private var _analyzed: Boolean = false + + /** + * Marks this plan as already analyzed. This should only be called by CheckAnalysis. + */ + private[catalyst] def setAnalyzed(): Unit = { _analyzed = true } + + /** + * Returns true if this node and its children have already been gone through analysis and + * verification. Note that this is only an optimization used to avoid analyzing trees that + * have already been analyzed, and can be reset by transformations. + */ + def analyzed: Boolean = _analyzed + + /** + * Returns a copy of this node where `rule` has been recursively applied first to all of its + * children and then itself (post-order). When `rule` does not apply to a given node, it is left + * unchanged. This function is similar to `transformUp`, but skips sub-trees that have already + * been marked as analyzed. 
+ * + * @param rule the function use to transform this nodes children + */ + def resolveOperators(rule: PartialFunction[LogicalPlan, LogicalPlan]): LogicalPlan = { + if (!analyzed) { + val afterRuleOnChildren = transformChildren(rule, (t, r) => t.resolveOperators(r)) + if (this fastEquals afterRuleOnChildren) { + CurrentOrigin.withOrigin(origin) { + rule.applyOrElse(this, identity[LogicalPlan]) + } + } else { + CurrentOrigin.withOrigin(origin) { + rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan]) + } + } + } else { + this + } + } + + /** + * Recursively transforms the expressions of a tree, skipping nodes that have already + * been analyzed. + */ + def resolveExpressions(r: PartialFunction[Expression, Expression]): LogicalPlan = { + this resolveOperators { + case p => p.transformExpressions(r) + } + } + /** * Computes [[Statistics]] for this plan. The default implementation assumes the output * cardinality is the product of of all child plan's cardinality, i.e. applies in the case @@ -130,47 +179,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { def resolveQuoted( name: String, resolver: Resolver): Option[NamedExpression] = { - resolve(parseAttributeName(name), output, resolver) - } - - /** - * Internal method, used to split attribute name by dot with backticks rule. - * Backticks must appear in pairs, and the quoted string must be a complete name part, - * which means `ab..c`e.f is not allowed. - * Escape character is not supported now, so we can't use backtick inside name part. - */ - private def parseAttributeName(name: String): Seq[String] = { - val e = new AnalysisException(s"syntax error in attribute name: $name") - val nameParts = scala.collection.mutable.ArrayBuffer.empty[String] - val tmp = scala.collection.mutable.ArrayBuffer.empty[Char] - var inBacktick = false - var i = 0 - while (i < name.length) { - val char = name(i) - if (inBacktick) { - if (char == '`') { - inBacktick = false - if (i + 1 < name.length && name(i + 1) != '.') throw e - } else { - tmp += char - } - } else { - if (char == '`') { - if (tmp.nonEmpty) throw e - inBacktick = true - } else if (char == '.') { - if (name(i - 1) == '.' || i == name.length - 1) throw e - nameParts += tmp.mkString - tmp.clear() - } else { - tmp += char - } - } - i += 1 - } - if (inBacktick) throw e - nameParts += tmp.mkString - nameParts.toSeq + resolve(UnresolvedAttribute.parseAttributeName(name), output, resolver) } /** @@ -250,13 +259,13 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { // One match, but we also need to extract the requested nested field. case Seq((a, nestedFields)) => // The foldLeft adds ExtractValues for every remaining parts of the identifier, - // and wrap it with UnresolvedAlias which will be removed later. + // and aliased it with the last part of the name. // For example, consider "a.b.c", where "a" is resolved to an existing attribute. - // Then this will add ExtractValue("c", ExtractValue("b", a)), and wrap it as - // UnresolvedAlias(ExtractValue("c", ExtractValue("b", a))). + // Then this will add ExtractValue("c", ExtractValue("b", a)), and alias the final + // expression as "c". val fieldExprs = nestedFields.foldLeft(a: Expression)((expr, fieldName) => ExtractValue(expr, Literal(fieldName), resolver)) - Some(UnresolvedAlias(fieldExprs)) + Some(Alias(fieldExprs, nestedFields.last)()) // No matches. 
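resolveOperators is essentially transformUp plus an `analyzed` short-circuit, so analyzer rules never revisit subtrees that have already gone through analysis. A toy-tree sketch of that traversal (not Spark's LogicalPlan):

case class Node(name: String, children: List[Node], analyzed: Boolean = false) {
  def resolveOperators(rule: PartialFunction[Node, Node]): Node =
    if (analyzed) this                                  // skip already-analyzed subtrees
    else {
      val afterChildren = copy(children = children.map(_.resolveOperators(rule)))
      rule.applyOrElse(afterChildren, identity[Node])   // then apply the rule to this node
    }
}

object ResolveOperatorsDemo {
  def main(args: Array[String]): Unit = {
    val rename: PartialFunction[Node, Node] = { case n if n.name == "a" => n.copy(name = "A") }
    val plan = Node("root", List(Node("a", Nil), Node("a", Nil, analyzed = true)))
    println(plan.resolveOperators(rename))  // only the un-analyzed child is renamed to "A"
  }
}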
case Seq() => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 54b5f4977266..722f69cdca82 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -34,7 +34,7 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extend }.nonEmpty ) - expressions.forall(_.resolved) && childrenResolved && !hasSpecialExpressions + !expressions.exists(!_.resolved) && childrenResolved && !hasSpecialExpressions } } @@ -68,7 +68,7 @@ case class Generate( generator.resolved && childrenResolved && generator.elementTypes.length == generatorOutput.length && - generatorOutput.forall(_.resolved) + !generatorOutput.exists(!_.resolved) } // we don't want the gOutput to be taken as part of the expressions @@ -86,46 +86,24 @@ case class Generate( } case class Filter(condition: Expression, child: LogicalPlan) extends UnaryNode { - /** - * Indicates if `atLeastNNulls` is used to check if atLeastNNulls.children - * have at least one null value and atLeastNNulls.children are all attributes. - */ - private def isAtLeastOneNullOutputAttributes(atLeastNNulls: AtLeastNNulls): Boolean = { - val expressions = atLeastNNulls.children - val n = atLeastNNulls.n - if (n != 1) { - // AtLeastNNulls is not used to check if atLeastNNulls.children have - // at least one null value. - false - } else { - // AtLeastNNulls is used to check if atLeastNNulls.children have - // at least one null value. We need to make sure all atLeastNNulls.children - // are attributes. - expressions.forall(_.isInstanceOf[Attribute]) - } - } - - override def output: Seq[Attribute] = condition match { - case Not(a: AtLeastNNulls) if isAtLeastOneNullOutputAttributes(a) => - // The condition is used to make sure that there is no null value in - // a.children. - val nonNullableAttributes = AttributeSet(a.children.asInstanceOf[Seq[Attribute]]) - child.output.map { - case attr if nonNullableAttributes.contains(attr) => - attr.withNullability(false) - case attr => attr - } - case _ => child.output - } + override def output: Seq[Attribute] = child.output } -case class Union(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { +abstract class SetOperation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { // TODO: These aren't really the same attributes as nullability etc might change. 
- override def output: Seq[Attribute] = left.output + final override def output: Seq[Attribute] = left.output - override lazy val resolved: Boolean = + final override lazy val resolved: Boolean = childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } + left.output.length == right.output.length && + left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } +} + +private[sql] object SetOperation { + def unapply(p: SetOperation): Option[(LogicalPlan, LogicalPlan)] = Some((p.left, p.right)) +} + +case class Union(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) { override def statistics: Statistics = { val sizeInBytes = left.statistics.sizeInBytes + right.statistics.sizeInBytes @@ -133,6 +111,10 @@ case class Union(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { } } +case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) + +case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) + case class Join( left: LogicalPlan, right: LogicalPlan, @@ -172,15 +154,6 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output } - -case class Except(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { - override def output: Seq[Attribute] = left.output - - override lazy val resolved: Boolean = - childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } -} - case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], @@ -190,7 +163,7 @@ case class InsertIntoTable( extends LogicalPlan { override def children: Seq[LogicalPlan] = child :: Nil - override def output: Seq[Attribute] = child.output + override def output: Seq[Attribute] = Seq.empty assert(overwrite || !ifNotExists) override lazy val resolved: Boolean = childrenResolved && child.output.zip(table.output).forall { @@ -218,7 +191,7 @@ case class WithWindowDefinition( } /** - * @param order The ordering expressions, should all be [[AttributeReference]] + * @param order The ordering expressions * @param global True means global sorting apply for entire data set, * False means sorting only apply within the partition. 
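Union, Intersect and Except now share the SetOperation base class, and the companion `unapply` lets a single pattern match any of the three. A toy sketch of that base-class-plus-extractor shape (`Plan` stands in for LogicalPlan):

object SetOpSketch {
  case class Plan(name: String)

  abstract class SetOperation(val left: Plan, val right: Plan)
  case class Union(l: Plan, r: Plan) extends SetOperation(l, r)
  case class Intersect(l: Plan, r: Plan) extends SetOperation(l, r)
  case class Except(l: Plan, r: Plan) extends SetOperation(l, r)

  object SetOperation {
    // One extractor matches all three concrete operators.
    def unapply(p: SetOperation): Option[(Plan, Plan)] = Some((p.left, p.right))
  }

  def describe(op: SetOperation): String = op match {
    case SetOperation(l, r) => s"set operation over ${l.name} and ${r.name}"
  }

  def main(args: Array[String]): Unit =
    println(describe(Intersect(Plan("t1"), Plan("t2"))))  // set operation over t1 and t2
}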
* @param child Child logical plan @@ -228,11 +201,6 @@ case class Sort( global: Boolean, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output - - def hasNoEvaluation: Boolean = order.forall(_.child.isInstanceOf[AttributeReference]) - - override lazy val resolved: Boolean = - expressions.forall(_.resolved) && childrenResolved && hasNoEvaluation } case class Aggregate( @@ -247,7 +215,7 @@ case class Aggregate( }.nonEmpty ) - expressions.forall(_.resolved) && childrenResolved && !hasWindowExpressions + !expressions.exists(!_.resolved) && childrenResolved && !hasWindowExpressions } lazy val newAggregation: Option[Aggregate] = Utils.tryConvert(this) @@ -263,7 +231,7 @@ case class Window( child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = - (projectList ++ windowExpressions).map(_.toAttribute) + projectList ++ windowExpressions.map(_.toAttribute) } /** @@ -475,10 +443,3 @@ case object OneRowRelation extends LeafNode { override def statistics: Statistics = Statistics(sizeInBytes = 1) } -case class Intersect(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { - override def output: Seq[Attribute] = left.output - - override lazy val resolved: Boolean = - childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index ec659ce789c2..5ac3f1f5b0ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -75,6 +75,37 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution { def clustering: Set[Expression] = ordering.map(_.child).toSet } +/** + * Describes how an operator's output is split across partitions. The `compatibleWith`, + * `guarantees`, and `satisfies` methods describe relationships between child partitionings, + * target partitionings, and [[Distribution]]s. These relations are described more precisely in + * their individual method docs, but at a high level: + * + * - `satisfies` is a relationship between partitionings and distributions. + * - `compatibleWith` is relationships between an operator's child output partitionings. + * - `guarantees` is a relationship between a child's existing output partitioning and a target + * output partitioning. + * + * Diagrammatically: + * + * +--------------+ + * | Distribution | + * +--------------+ + * ^ + * | + * satisfies + * | + * +--------------+ +--------------+ + * | Child | | Target | + * +----| Partitioning |----guarantees--->| Partitioning | + * | +--------------+ +--------------+ + * | ^ + * | | + * | compatibleWith + * | | + * +------------+ + * + */ sealed trait Partitioning { /** Returns the number of partitions that the data is split across */ val numPartitions: Int @@ -90,9 +121,66 @@ sealed trait Partitioning { /** * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] * guarantees the same partitioning scheme described by `other`. + * + * Compatibility of partitionings is only checked for operators that have multiple children + * and that require a specific child output [[Distribution]], such as joins. + * + * Intuitively, partitionings are compatible if they route the same partitioning key to the same + * partition. 
For instance, two hash partitionings are only compatible if they produce the same + * number of output partitionings and hash records according to the same hash function and + * same partitioning key schema. + * + * Put another way, two partitionings are compatible with each other if they satisfy all of the + * same distribution guarantees. */ - // TODO: Add an example once we have the `nullSafe` concept. - def guarantees(other: Partitioning): Boolean + def compatibleWith(other: Partitioning): Boolean + + /** + * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] guarantees + * the same partitioning scheme described by `other`. If a `A.guarantees(B)`, then repartitioning + * the child's output according to `B` will be unnecessary. `guarantees` is used as a performance + * optimization to allow the exchange planner to avoid redundant repartitionings. By default, + * a partitioning only guarantees partitionings that are equal to itself (i.e. the same number + * of partitions, same strategy (range or hash), etc). + * + * In order to enable more aggressive optimization, this strict equality check can be relaxed. + * For example, say that the planner needs to repartition all of an operator's children so that + * they satisfy the [[AllTuples]] distribution. One way to do this is to repartition all children + * to have the [[SinglePartition]] partitioning. If one of the operator's children already happens + * to be hash-partitioned with a single partition then we do not need to re-shuffle this child; + * this repartitioning can be avoided if a single-partition [[HashPartitioning]] `guarantees` + * [[SinglePartition]]. + * + * The SinglePartition example given above is not particularly interesting; guarantees' real + * value occurs for more advanced partitioning strategies. SPARK-7871 will introduce a notion + * of null-safe partitionings, under which partitionings can specify whether rows whose + * partitioning keys contain null values will be grouped into the same partition or whether they + * will have an unknown / random distribution. If a partitioning does not require nulls to be + * clustered then a partitioning which _does_ cluster nulls will guarantee the null clustered + * partitioning. The converse is not true, however: a partitioning which clusters nulls cannot + * be guaranteed by one which does not cluster them. Thus, in general `guarantees` is not a + * symmetric relation. + * + * Another way to think about `guarantees`: if `A.guarantees(B)`, then any partitioning of rows + * produced by `A` could have also been produced by `B`. 
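A compact sketch of the relations described in this comment, together with the equality default for `guarantees` and the sliding-window `allCompatible` check that follow below (toy Partitioning types, not Spark's):

object PartitioningSketch {
  sealed trait Partitioning {
    def numPartitions: Int
    def compatibleWith(other: Partitioning): Boolean
    def guarantees(other: Partitioning): Boolean = this == other  // strict default
  }
  case class HashPartitioning(keys: Seq[String], numPartitions: Int) extends Partitioning {
    def compatibleWith(other: Partitioning): Boolean = other match {
      case o: HashPartitioning => this == o   // same keys, same partition count
      case _ => false
    }
  }
  case class UnknownPartitioning(numPartitions: Int) extends Partitioning {
    def compatibleWith(other: Partitioning): Boolean = false
    override def guarantees(other: Partitioning): Boolean = false
  }

  // Pairwise check over adjacent children; a single-element window is trivially compatible.
  def allCompatible(ps: Seq[Partitioning]): Boolean =
    ps.sliding(2).forall {
      case Seq(_)    => true
      case Seq(a, b) => a.compatibleWith(b) && b.compatibleWith(a)
    }

  def main(args: Array[String]): Unit = {
    val p = HashPartitioning(Seq("k"), 8)
    println(allCompatible(Seq(p, HashPartitioning(Seq("k"), 8))))  // true
    println(allCompatible(Seq(p, UnknownPartitioning(8))))         // false
  }
}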
+ */ + def guarantees(other: Partitioning): Boolean = this == other +} + +object Partitioning { + def allCompatible(partitionings: Seq[Partitioning]): Boolean = { + // Note: this assumes transitivity + partitionings.sliding(2).map { + case Seq(a) => true + case Seq(a, b) => + if (a.numPartitions != b.numPartitions) { + assert(!a.compatibleWith(b) && !b.compatibleWith(a)) + false + } else { + a.compatibleWith(b) && b.compatibleWith(a) + } + }.forall(_ == true) + } } case class UnknownPartitioning(numPartitions: Int) extends Partitioning { @@ -101,6 +189,8 @@ case class UnknownPartitioning(numPartitions: Int) extends Partitioning { case _ => false } + override def compatibleWith(other: Partitioning): Boolean = false + override def guarantees(other: Partitioning): Boolean = false } @@ -109,21 +199,9 @@ case object SinglePartition extends Partitioning { override def satisfies(required: Distribution): Boolean = true - override def guarantees(other: Partitioning): Boolean = other match { - case SinglePartition => true - case _ => false - } -} - -case object BroadcastPartitioning extends Partitioning { - val numPartitions = 1 - - override def satisfies(required: Distribution): Boolean = true + override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1 - override def guarantees(other: Partitioning): Boolean = other match { - case BroadcastPartitioning => true - case _ => false - } + override def guarantees(other: Partitioning): Boolean = other.numPartitions == 1 } /** @@ -138,20 +216,23 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - lazy val clusteringSet = expressions.toSet - override def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true case ClusteredDistribution(requiredClustering) => - clusteringSet.subsetOf(requiredClustering.toSet) + expressions.toSet.subsetOf(requiredClustering.toSet) + case _ => false + } + + override def compatibleWith(other: Partitioning): Boolean = other match { + case o: HashPartitioning => this == o case _ => false } override def guarantees(other: Partitioning): Boolean = other match { - case o: HashPartitioning => - this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions + case o: HashPartitioning => this == o case _ => false } + } /** @@ -173,15 +254,18 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - private[this] lazy val clusteringSet = ordering.map(_.child).toSet - override def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true case OrderedDistribution(requiredOrdering) => val minSize = Seq(requiredOrdering.size, ordering.size).min requiredOrdering.take(minSize) == ordering.take(minSize) case ClusteredDistribution(requiredClustering) => - clusteringSet.subsetOf(requiredClustering.toSet) + ordering.map(_.child).toSet.subsetOf(requiredClustering.toSet) + case _ => false + } + + override def compatibleWith(other: Partitioning): Boolean = other match { + case o: RangePartitioning => this == o case _ => false } @@ -228,6 +312,13 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) override def satisfies(required: Distribution): Boolean = partitionings.exists(_.satisfies(required)) + /** + * Returns true if any `partitioning` of this collection is compatible with + * the 
given [[Partitioning]]. + */ + override def compatibleWith(other: Partitioning): Boolean = + partitionings.exists(_.compatibleWith(other)) + /** * Returns true if any `partitioning` of this collection guarantees * the given [[Partitioning]]. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 3f9858b0c4a4..f80d2a93241d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,10 +17,30 @@ package org.apache.spark.sql.catalyst.rules +import scala.collection.JavaConverters._ + +import com.google.common.util.concurrent.AtomicLongMap + import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide +object RuleExecutor { + protected val timeMap = AtomicLongMap.create[String]() + + /** Resets statistics about time spent running specific rules */ + def resetTime(): Unit = timeMap.clear() + + /** Dump statistics about time spent running specific rules. */ + def dumpTimeSpent(): String = { + val map = timeMap.asMap().asScala + val maxSize = map.keys.map(_.toString.length).max + map.toSeq.sortBy(_._2).reverseMap { case (k, v) => + s"${k.padTo(maxSize, " ").mkString} $v" + }.mkString("\n") + } +} + abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { /** @@ -41,6 +61,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { /** Defines a sequence of rule batches, to be overridden by the implementation. */ protected val batches: Seq[Batch] + /** * Executes the batches of rules defined by the subclass. The batches are executed serially * using the defined execution strategy. Within each batch, rules are also executed serially. @@ -58,7 +79,11 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { while (continue) { curPlan = batch.rules.foldLeft(curPlan) { case (plan, rule) => + val startTime = System.nanoTime() val result = rule(plan) + val runTime = System.nanoTime() - startTime + RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime) + if (!result.fastEquals(plan)) { logTrace( s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 122e9fc5ed77..7971e25188e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -149,7 +149,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { /** * Returns a copy of this node where `f` has been applied to all the nodes children. */ - def mapChildren(f: BaseType => BaseType): this.type = { + def mapChildren(f: BaseType => BaseType): BaseType = { var changed = false val newArgs = productIterator.map { case arg: TreeNode[_] if containsChild(arg) => @@ -170,7 +170,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { * Returns a copy of this node with the children replaced. * TODO: Validate somewhere (in debug mode?) that children are ordered correctly. 
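RuleExecutor above accumulates per-rule nanoseconds in a Guava AtomicLongMap and dumps them sorted by the time spent. A dependency-free, single-threaded sketch of the same bookkeeping with a mutable map:

import scala.collection.mutable

object RuleTimingSketch {
  private val timeMap = mutable.Map.empty[String, Long].withDefaultValue(0L)

  // Times one rule application and adds the elapsed nanoseconds to the rule's total.
  def timed[T](ruleName: String)(body: => T): T = {
    val start = System.nanoTime()
    try body
    finally timeMap(ruleName) += System.nanoTime() - start
  }

  // Longest-running rules first, padded like dumpTimeSpent above.
  def dumpTimeSpent(): String = {
    val maxLen = if (timeMap.isEmpty) 0 else timeMap.keys.map(_.length).max
    timeMap.toSeq.sortBy(-_._2)
      .map { case (k, v) => s"${k.padTo(maxLen, ' ')} ${v}ns" }
      .mkString("\n")
  }

  def main(args: Array[String]): Unit = {
    timed("ConstantFolding") { Thread.sleep(2) }
    timed("ColumnPruning")   { Thread.sleep(1) }
    println(dumpTimeSpent())
  }
}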
*/ - def withNewChildren(newChildren: Seq[BaseType]): this.type = { + def withNewChildren(newChildren: Seq[BaseType]): BaseType = { assert(newChildren.size == children.size, "Incorrect number of children") var changed = false val remainingNewChildren = newChildren.toBuffer @@ -229,9 +229,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { // Check if unchanged and then possibly return old copy to avoid gc churn. if (this fastEquals afterRule) { - transformChildrenDown(rule) + transformChildren(rule, (t, r) => t.transformDown(r)) } else { - afterRule.transformChildrenDown(rule) + afterRule.transformChildren(rule, (t, r) => t.transformDown(r)) } } @@ -240,11 +240,13 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { * this node. When `rule` does not apply to a given node it is left unchanged. * @param rule the function used to transform this nodes children */ - def transformChildrenDown(rule: PartialFunction[BaseType, BaseType]): this.type = { + protected def transformChildren( + rule: PartialFunction[BaseType, BaseType], + nextOperation: (BaseType, PartialFunction[BaseType, BaseType]) => BaseType): BaseType = { var changed = false val newArgs = productIterator.map { case arg: TreeNode[_] if containsChild(arg) => - val newChild = arg.asInstanceOf[BaseType].transformDown(rule) + val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { changed = true newChild @@ -252,7 +254,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { arg } case Some(arg: TreeNode[_]) if containsChild(arg) => - val newChild = arg.asInstanceOf[BaseType].transformDown(rule) + val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { changed = true Some(newChild) @@ -263,7 +265,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { case d: DataType => d // Avoid unpacking Structs case args: Traversable[_] => args.map { case arg: TreeNode[_] if containsChild(arg) => - val newChild = arg.asInstanceOf[BaseType].transformDown(rule) + val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { changed = true newChild @@ -285,7 +287,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { * @param rule the function use to transform this nodes children */ def transformUp(rule: PartialFunction[BaseType, BaseType]): BaseType = { - val afterRuleOnChildren = transformChildrenUp(rule) + val afterRuleOnChildren = transformChildren(rule, (t, r) => t.transformUp(r)) if (this fastEquals afterRuleOnChildren) { CurrentOrigin.withOrigin(origin) { rule.applyOrElse(this, identity[BaseType]) @@ -297,44 +299,6 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } } - def transformChildrenUp(rule: PartialFunction[BaseType, BaseType]): this.type = { - var changed = false - val newArgs = productIterator.map { - case arg: TreeNode[_] if containsChild(arg) => - val newChild = arg.asInstanceOf[BaseType].transformUp(rule) - if (!(newChild fastEquals arg)) { - changed = true - newChild - } else { - arg - } - case Some(arg: TreeNode[_]) if containsChild(arg) => - val newChild = arg.asInstanceOf[BaseType].transformUp(rule) - if (!(newChild fastEquals arg)) { - changed = true - Some(newChild) - } else { - Some(arg) - } - case m: Map[_, _] => m - case d: DataType => d // Avoid unpacking Structs - case args: Traversable[_] => args.map { - case arg: TreeNode[_] if containsChild(arg) => - val 
newChild = arg.asInstanceOf[BaseType].transformUp(rule) - if (!(newChild fastEquals arg)) { - changed = true - newChild - } else { - arg - } - case other => other - } - case nonChild: AnyRef => nonChild - case null => null - }.toArray - if (changed) makeCopy(newArgs) else this - } - /** * Args to the constructor that should be copied, but not transformed. * These are appended to the transformed args automatically by makeCopy @@ -348,7 +312,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { * that are not present in the productIterator. * @param newArgs the new product arguments. */ - def makeCopy(newArgs: Array[AnyRef]): this.type = attachTree(this, "makeCopy") { + def makeCopy(newArgs: Array[AnyRef]): BaseType = attachTree(this, "makeCopy") { val ctors = getClass.getConstructors.filter(_.getParameterTypes.size != 0) if (ctors.isEmpty) { sys.error(s"No valid constructor for $nodeName") @@ -359,9 +323,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { CurrentOrigin.withOrigin(origin) { // Skip no-arg constructors that are just there for kryo. if (otherCopyArgs.isEmpty) { - defaultCtor.newInstance(newArgs: _*).asInstanceOf[this.type] + defaultCtor.newInstance(newArgs: _*).asInstanceOf[BaseType] } else { - defaultCtor.newInstance((newArgs ++ otherCopyArgs).toArray: _*).asInstanceOf[this.type] + defaultCtor.newInstance((newArgs ++ otherCopyArgs).toArray: _*).asInstanceOf[BaseType] } } } catch { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index f645eb5f7bb0..687ca000d12b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -31,11 +31,18 @@ import org.apache.spark.unsafe.types.UTF8String * precision. */ object DateTimeUtils { + + // we use Int and Long internally to represent [[DateType]] and [[TimestampType]] + type SQLDate = Int + type SQLTimestamp = Long + // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian - final val JULIAN_DAY_OF_EPOCH = 2440587 // and .5 + // it's 2440587.5, rounding up to compatible with Hive + final val JULIAN_DAY_OF_EPOCH = 2440588 final val SECONDS_PER_DAY = 60 * 60 * 24L final val MICROS_PER_SECOND = 1000L * 1000L final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L + final val MICROS_PER_DAY = MICROS_PER_SECOND * SECONDS_PER_DAY final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L @@ -72,7 +79,7 @@ object DateTimeUtils { } // we should use the exact day as Int, for example, (year, month, day) -> day - def millisToDays(millisUtc: Long): Int = { + def millisToDays(millisUtc: Long): SQLDate = { // SPARK-6785: use Math.floor so negative number of days (dates before 1970) // will correctly work as input for function toJavaDate(Int) val millisLocal = millisUtc + threadLocalLocalTimeZone.get().getOffset(millisUtc) @@ -80,16 +87,16 @@ object DateTimeUtils { } // reverse of millisToDays - def daysToMillis(days: Int): Long = { + def daysToMillis(days: SQLDate): Long = { val millisUtc = days.toLong * MILLIS_PER_DAY millisUtc - threadLocalLocalTimeZone.get().getOffset(millisUtc) } - def dateToString(days: Int): String = + def dateToString(days: SQLDate): String = threadLocalDateFormat.get.format(toJavaDate(days)) // Converts Timestamp to string according to Hive TimestampWritable convention. 
- def timestampToString(us: Long): String = { + def timestampToString(us: SQLTimestamp): String = { val ts = toJavaTimestamp(us) val timestampString = ts.toString val formatted = threadLocalTimestampFormat.get.format(ts) @@ -132,21 +139,21 @@ object DateTimeUtils { /** * Returns the number of days since epoch from from java.sql.Date. */ - def fromJavaDate(date: Date): Int = { + def fromJavaDate(date: Date): SQLDate = { millisToDays(date.getTime) } /** * Returns a java.sql.Date from number of days since epoch. */ - def toJavaDate(daysSinceEpoch: Int): Date = { + def toJavaDate(daysSinceEpoch: SQLDate): Date = { new Date(daysToMillis(daysSinceEpoch)) } /** * Returns a java.sql.Timestamp from number of micros since epoch. */ - def toJavaTimestamp(us: Long): Timestamp = { + def toJavaTimestamp(us: SQLTimestamp): Timestamp = { // setNanos() will overwrite the millisecond part, so the milliseconds should be // cut off at seconds var seconds = us / MICROS_PER_SECOND @@ -164,7 +171,7 @@ object DateTimeUtils { /** * Returns the number of micros since epoch from java.sql.Timestamp. */ - def fromJavaTimestamp(t: Timestamp): Long = { + def fromJavaTimestamp(t: Timestamp): SQLTimestamp = { if (t != null) { t.getTime() * 1000L + (t.getNanos().toLong / 1000) % 1000L } else { @@ -176,21 +183,22 @@ object DateTimeUtils { * Returns the number of microseconds since epoch from Julian day * and nanoseconds in a day */ - def fromJulianDay(day: Int, nanoseconds: Long): Long = { + def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = { // use Long to avoid rounding errors - val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2 + val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY seconds * MICROS_PER_SECOND + nanoseconds / 1000L } /** * Returns Julian day and nanoseconds in a day from the number of microseconds + * + * Note: support timestamp since 4717 BC (without negative nanoseconds, compatible with Hive). */ - def toJulianDay(us: Long): (Int, Long) = { - val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2 - val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH - val secondsInDay = seconds % SECONDS_PER_DAY - val nanos = (us % MICROS_PER_SECOND) * 1000L - (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos) + def toJulianDay(us: SQLTimestamp): (Int, Long) = { + val julian_us = us + JULIAN_DAY_OF_EPOCH * MICROS_PER_DAY + val day = julian_us / MICROS_PER_DAY + val micros = julian_us % MICROS_PER_DAY + (day.toInt, micros * 1000L) } /** @@ -219,7 +227,7 @@ object DateTimeUtils { * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` */ - def stringToTimestamp(s: UTF8String): Option[Long] = { + def stringToTimestamp(s: UTF8String): Option[SQLTimestamp] = { if (s == null) { return None } @@ -355,7 +363,7 @@ object DateTimeUtils { * `yyyy-[m]m-[d]d *` * `yyyy-[m]m-[d]dT*` */ - def stringToDate(s: UTF8String): Option[Int] = { + def stringToDate(s: UTF8String): Option[SQLDate] = { if (s == null) { return None } @@ -394,7 +402,7 @@ object DateTimeUtils { /** * Returns the hour value of a given timestamp value. The timestamp is expressed in microseconds. */ - def getHours(timestamp: Long): Int = { + def getHours(timestamp: SQLTimestamp): Int = { val localTs = (timestamp / 1000) + defaultTimeZone.getOffset(timestamp / 1000) ((localTs / 1000 / 3600) % 24).toInt } @@ -403,7 +411,7 @@ object DateTimeUtils { * Returns the minute value of a given timestamp value. 
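With JULIAN_DAY_OF_EPOCH rounded up to 2440588, the Julian-day conversions above reduce to whole-day arithmetic in microseconds. A standalone copy of that arithmetic, with a round trip at the Unix epoch:

object JulianDaySketch {
  // Same constants as DateTimeUtils; a SQLTimestamp is microseconds since 1970-01-01 00:00 UTC.
  final val JULIAN_DAY_OF_EPOCH = 2440588L
  final val MICROS_PER_SECOND   = 1000L * 1000L
  final val SECONDS_PER_DAY     = 60 * 60 * 24L
  final val MICROS_PER_DAY      = MICROS_PER_SECOND * SECONDS_PER_DAY

  def fromJulianDay(day: Int, nanoseconds: Long): Long =
    (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY * MICROS_PER_SECOND + nanoseconds / 1000L

  def toJulianDay(us: Long): (Int, Long) = {
    val julianUs = us + JULIAN_DAY_OF_EPOCH * MICROS_PER_DAY
    ((julianUs / MICROS_PER_DAY).toInt, (julianUs % MICROS_PER_DAY) * 1000L)
  }

  def main(args: Array[String]): Unit = {
    println(toJulianDay(0L))                  // (2440588,0): midnight at the Unix epoch
    println(fromJulianDay(2440588, 0L) == 0L) // true: round trip
  }
}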
The timestamp is expressed in * microseconds. */ - def getMinutes(timestamp: Long): Int = { + def getMinutes(timestamp: SQLTimestamp): Int = { val localTs = (timestamp / 1000) + defaultTimeZone.getOffset(timestamp / 1000) ((localTs / 1000 / 60) % 60).toInt } @@ -412,7 +420,7 @@ object DateTimeUtils { * Returns the second value of a given timestamp value. The timestamp is expressed in * microseconds. */ - def getSeconds(timestamp: Long): Int = { + def getSeconds(timestamp: SQLTimestamp): Int = { ((timestamp / 1000 / 1000) % 60).toInt } @@ -447,7 +455,7 @@ object DateTimeUtils { * The calculation uses the fact that the period 1.1.2001 until 31.12.2400 is * equals to the period 1.1.1601 until 31.12.2000. */ - private[this] def getYearAndDayInYear(daysSince1970: Int): (Int, Int) = { + private[this] def getYearAndDayInYear(daysSince1970: SQLDate): (Int, Int) = { // add the difference (in days) between 1.1.1970 and the artificial year 0 (-17999) val daysNormalized = daysSince1970 + toYearZero val numOfQuarterCenturies = daysNormalized / daysIn400Years @@ -461,7 +469,7 @@ object DateTimeUtils { * Returns the 'day in year' value for the given date. The date is expressed in days * since 1.1.1970. */ - def getDayInYear(date: Int): Int = { + def getDayInYear(date: SQLDate): Int = { getYearAndDayInYear(date)._2 } @@ -469,7 +477,7 @@ object DateTimeUtils { * Returns the year value for the given date. The date is expressed in days * since 1.1.1970. */ - def getYear(date: Int): Int = { + def getYear(date: SQLDate): Int = { getYearAndDayInYear(date)._1 } @@ -477,7 +485,7 @@ object DateTimeUtils { * Returns the quarter for the given date. The date is expressed in days * since 1.1.1970. */ - def getQuarter(date: Int): Int = { + def getQuarter(date: SQLDate): Int = { var (year, dayInYear) = getYearAndDayInYear(date) if (isLeapYear(year)) { dayInYear = dayInYear - 1 @@ -493,11 +501,55 @@ object DateTimeUtils { } } + /** + * Split date (expressed in days since 1.1.1970) into four fields: + * year, month (Jan is Month 1), dayInMonth, daysToMonthEnd (0 if it's last day of month). + */ + def splitDate(date: SQLDate): (Int, Int, Int, Int) = { + var (year, dayInYear) = getYearAndDayInYear(date) + val isLeap = isLeapYear(year) + if (isLeap && dayInYear == 60) { + (year, 2, 29, 0) + } else { + if (isLeap && dayInYear > 60) dayInYear -= 1 + + if (dayInYear <= 181) { + if (dayInYear <= 31) { + (year, 1, dayInYear, 31 - dayInYear) + } else if (dayInYear <= 59) { + (year, 2, dayInYear - 31, if (isLeap) 60 - dayInYear else 59 - dayInYear) + } else if (dayInYear <= 90) { + (year, 3, dayInYear - 59, 90 - dayInYear) + } else if (dayInYear <= 120) { + (year, 4, dayInYear - 90, 120 - dayInYear) + } else if (dayInYear <= 151) { + (year, 5, dayInYear - 120, 151 - dayInYear) + } else { + (year, 6, dayInYear - 151, 181 - dayInYear) + } + } else { + if (dayInYear <= 212) { + (year, 7, dayInYear - 181, 212 - dayInYear) + } else if (dayInYear <= 243) { + (year, 8, dayInYear - 212, 243 - dayInYear) + } else if (dayInYear <= 273) { + (year, 9, dayInYear - 243, 273 - dayInYear) + } else if (dayInYear <= 304) { + (year, 10, dayInYear - 273, 304 - dayInYear) + } else if (dayInYear <= 334) { + (year, 11, dayInYear - 304, 334 - dayInYear) + } else { + (year, 12, dayInYear - 334, 365 - dayInYear) + } + } + } + } + /** * Returns the month value for the given date. The date is expressed in days * since 1.1.1970. January is month 1. 
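
As a quick sanity check of the field extractors above, a REPL-style usage sketch (the day number is worked out by hand: 2015-01-01 is day 16436, so day 16525 is 2015-03-31):

    import org.apache.spark.sql.catalyst.util.DateTimeUtils

    DateTimeUtils.getYear(16525)       // 2015
    DateTimeUtils.getDayInYear(16525)  // 90 (Jan 31 + Feb 28 + Mar 31)
    DateTimeUtils.getQuarter(16525)    // 1
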
*/ - def getMonth(date: Int): Int = { + def getMonth(date: SQLDate): Int = { var (year, dayInYear) = getYearAndDayInYear(date) if (isLeapYear(year)) { if (dayInYear == 60) { @@ -538,7 +590,7 @@ object DateTimeUtils { * Returns the 'day of month' value for the given date. The date is expressed in days * since 1.1.1970. */ - def getDayOfMonth(date: Int): Int = { + def getDayOfMonth(date: SQLDate): Int = { var (year, dayInYear) = getYearAndDayInYear(date) if (isLeapYear(year)) { if (dayInYear == 60) { @@ -584,7 +636,7 @@ object DateTimeUtils { * Returns the date value for the first day of the given month. * The month is expressed in months since year zero (17999 BC), starting from 0. */ - private def firstDayOfMonth(absoluteMonth: Int): Int = { + private def firstDayOfMonth(absoluteMonth: Int): SQLDate = { val absoluteYear = absoluteMonth / 12 var monthInYear = absoluteMonth - absoluteYear * 12 var date = getDateFromYear(absoluteYear) @@ -602,7 +654,7 @@ object DateTimeUtils { * Returns the date value for January 1 of the given year. * The year is expressed in years since year zero (17999 BC), starting from 0. */ - private def getDateFromYear(absoluteYear: Int): Int = { + private def getDateFromYear(absoluteYear: Int): SQLDate = { val absoluteDays = (absoluteYear * 365 + absoluteYear / 400 - absoluteYear / 100 + absoluteYear / 4) absoluteDays - toYearZero @@ -612,16 +664,17 @@ object DateTimeUtils { * Add date and year-month interval. * Returns a date value, expressed in days since 1.1.1970. */ - def dateAddMonths(days: Int, months: Int): Int = { - val absoluteMonth = (getYear(days) - YearZero) * 12 + getMonth(days) - 1 + months + def dateAddMonths(days: SQLDate, months: Int): SQLDate = { + val (year, monthInYear, dayOfMonth, daysToMonthEnd) = splitDate(days) + val absoluteMonth = (year - YearZero) * 12 + monthInYear - 1 + months val nonNegativeMonth = if (absoluteMonth >= 0) absoluteMonth else 0 val currentMonthInYear = nonNegativeMonth % 12 val currentYear = nonNegativeMonth / 12 + val leapDay = if (currentMonthInYear == 1 && isLeapYear(currentYear + YearZero)) 1 else 0 val lastDayOfMonth = monthDays(currentMonthInYear) + leapDay - val dayOfMonth = getDayOfMonth(days) - val currentDayInMonth = if (getDayOfMonth(days + 1) == 1 || dayOfMonth >= lastDayOfMonth) { + val currentDayInMonth = if (daysToMonthEnd == 0 || dayOfMonth >= lastDayOfMonth) { // last day of the month lastDayOfMonth } else { @@ -634,52 +687,12 @@ object DateTimeUtils { * Add timestamp and full interval. * Returns a timestamp value, expressed in microseconds since 1.1.1970 00:00:00. */ - def timestampAddInterval(start: Long, months: Int, microseconds: Long): Long = { + def timestampAddInterval(start: SQLTimestamp, months: Int, microseconds: Long): SQLTimestamp = { val days = millisToDays(start / 1000L) val newDays = dateAddMonths(days, months) daysToMillis(newDays) * 1000L + start - daysToMillis(days) * 1000L + microseconds } - /** - * Returns the last dayInMonth in the month it belongs to. The date is expressed - * in days since 1.1.1970. the return value starts from 1. 
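
With this change, dateAddMonths obtains year, month, day-of-month and distance to month end from a single splitDate call instead of three separate extractions. A REPL-style usage sketch (day numbers worked out by hand; 16525 is 2015-03-31 and 16495 is 2015-03-01):

    import org.apache.spark.sql.catalyst.util.DateTimeUtils

    DateTimeUtils.splitDate(16525)        // (2015, 3, 31, 0)  -- last day of March
    DateTimeUtils.splitDate(16495)        // (2015, 3, 1, 30)  -- first day of March

    // daysToMonthEnd == 0 triggers the "stay on the last day of the month" branch,
    // so adding one month to 2015-03-31 should land on 2015-04-30 (day 16555).
    DateTimeUtils.dateAddMonths(16525, 1) // 16555
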
- */ - private def getLastDayInMonthOfMonth(date: Int): Int = { - var (year, dayInYear) = getYearAndDayInYear(date) - if (isLeapYear(year)) { - if (dayInYear > 31 && dayInYear <= 60) { - return 29 - } else if (dayInYear > 60) { - dayInYear = dayInYear - 1 - } - } - if (dayInYear <= 31) { - 31 - } else if (dayInYear <= 59) { - 28 - } else if (dayInYear <= 90) { - 31 - } else if (dayInYear <= 120) { - 30 - } else if (dayInYear <= 151) { - 31 - } else if (dayInYear <= 181) { - 30 - } else if (dayInYear <= 212) { - 31 - } else if (dayInYear <= 243) { - 31 - } else if (dayInYear <= 273) { - 30 - } else if (dayInYear <= 304) { - 31 - } else if (dayInYear <= 334) { - 30 - } else { - 31 - } - } - /** * Returns number of months between time1 and time2. time1 and time2 are expressed in * microseconds since 1.1.1970. @@ -690,19 +703,18 @@ object DateTimeUtils { * Otherwise, the difference is calculated based on 31 days per month, and rounding to * 8 digits. */ - def monthsBetween(time1: Long, time2: Long): Double = { + def monthsBetween(time1: SQLTimestamp, time2: SQLTimestamp): Double = { val millis1 = time1 / 1000L val millis2 = time2 / 1000L val date1 = millisToDays(millis1) val date2 = millisToDays(millis2) - // TODO(davies): get year, month, dayOfMonth from single function - val dayInMonth1 = getDayOfMonth(date1) - val dayInMonth2 = getDayOfMonth(date2) - val months1 = getYear(date1) * 12 + getMonth(date1) - val months2 = getYear(date2) * 12 + getMonth(date2) - - if (dayInMonth1 == dayInMonth2 || (dayInMonth1 == getLastDayInMonthOfMonth(date1) - && dayInMonth2 == getLastDayInMonthOfMonth(date2))) { + val (year1, monthInYear1, dayInMonth1, daysToMonthEnd1) = splitDate(date1) + val (year2, monthInYear2, dayInMonth2, daysToMonthEnd2) = splitDate(date2) + + val months1 = year1 * 12 + monthInYear1 + val months2 = year2 * 12 + monthInYear2 + + if (dayInMonth1 == dayInMonth2 || ((daysToMonthEnd1 == 0) && (daysToMonthEnd2 == 0))) { return (months1 - months2).toDouble } // milliseconds is enough for 8 digits precision on the right side @@ -736,7 +748,7 @@ object DateTimeUtils { * Returns the first date which is later than startDate and is of the given dayOfWeek. * dayOfWeek is an integer ranges in [0, 6], and 0 is Thu, 1 is Fri, etc,. */ - def getNextDateForDayOfWeek(startDate: Int, dayOfWeek: Int): Int = { + def getNextDateForDayOfWeek(startDate: SQLDate, dayOfWeek: Int): SQLDate = { startDate + 1 + ((dayOfWeek - 1 - startDate) % 7 + 7) % 7 } @@ -744,41 +756,9 @@ object DateTimeUtils { * Returns last day of the month for the given date. The date is expressed in days * since 1.1.1970. 
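
The getNextDateForDayOfWeek formula above is easiest to read with a worked case. Per its comment, dayOfWeek 0 means Thursday, and day 0 (1970-01-01) happens to be a Thursday:

    import org.apache.spark.sql.catalyst.util.DateTimeUtils

    // Next Friday (dayOfWeek = 1) after Thursday 1970-01-01 (startDate = 0):
    //   0 + 1 + ((1 - 1 - 0) % 7 + 7) % 7 = 1   -> day 1, i.e. Friday 1970-01-02
    DateTimeUtils.getNextDateForDayOfWeek(0, 1)  // 1

    // Asking for the next Thursday is strictly "later than startDate":
    //   0 + 1 + ((0 - 1 - 0) % 7 + 7) % 7 = 7   -> day 7, i.e. Thursday 1970-01-08
    DateTimeUtils.getNextDateForDayOfWeek(0, 0)  // 7
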
*/ - def getLastDayOfMonth(date: Int): Int = { - var (year, dayInYear) = getYearAndDayInYear(date) - if (isLeapYear(year)) { - if (dayInYear > 31 && dayInYear <= 60) { - return date + (60 - dayInYear) - } else if (dayInYear > 60) { - dayInYear = dayInYear - 1 - } - } - val lastDayOfMonthInYear = if (dayInYear <= 31) { - 31 - } else if (dayInYear <= 59) { - 59 - } else if (dayInYear <= 90) { - 90 - } else if (dayInYear <= 120) { - 120 - } else if (dayInYear <= 151) { - 151 - } else if (dayInYear <= 181) { - 181 - } else if (dayInYear <= 212) { - 212 - } else if (dayInYear <= 243) { - 243 - } else if (dayInYear <= 273) { - 273 - } else if (dayInYear <= 304) { - 304 - } else if (dayInYear <= 334) { - 334 - } else { - 365 - } - date + (lastDayOfMonthInYear - dayInYear) + def getLastDayOfMonth(date: SQLDate): SQLDate = { + val (_, _, _, daysToMonthEnd) = splitDate(date) + date + daysToMonthEnd } private val TRUNC_TO_YEAR = 1 @@ -789,7 +769,7 @@ object DateTimeUtils { * Returns the trunc date from original date and trunc level. * Trunc level should be generated using `parseTruncLevel()`, should only be 1 or 2. */ - def truncDate(d: Int, level: Int): Int = { + def truncDate(d: SQLDate, level: Int): SQLDate = { if (level == TRUNC_TO_YEAR) { d - DateTimeUtils.getDayInYear(d) + 1 } else if (level == TRUNC_TO_MONTH) { @@ -820,7 +800,7 @@ object DateTimeUtils { * Returns a timestamp of given timezone from utc timestamp, with the same string * representation in their timezone. */ - def fromUTCTime(time: Long, timeZone: String): Long = { + def fromUTCTime(time: SQLTimestamp, timeZone: String): SQLTimestamp = { val tz = TimeZone.getTimeZone(timeZone) val offset = tz.getOffset(time / 1000L) time + offset * 1000L @@ -830,7 +810,7 @@ object DateTimeUtils { * Returns a utc timestamp from a given timestamp from a given timezone, with the same * string representation in their timezone. 
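
fromUTCTime above and its inverse just below shift the microsecond value by the zone's offset so the rendered wall-clock string is preserved in the target zone. A REPL-style sketch ("GMT+08:00" is only an illustrative zone id):

    import org.apache.spark.sql.catalyst.util.DateTimeUtils

    val utcMidnight = 0L                                              // 1970-01-01 00:00:00 UTC
    val shifted = DateTimeUtils.fromUTCTime(utcMidnight, "GMT+08:00") // 28800000000L, renders as 08:00:00
    DateTimeUtils.toUTCTime(shifted, "GMT+08:00") == utcMidnight      // true
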
*/ - def toUTCTime(time: Long, timeZone: String): Long = { + def toUTCTime(time: SQLTimestamp, timeZone: String): SQLTimestamp = { val tz = TimeZone.getTimeZone(timeZone) val offset = tz.getOffset(time / 1000L) time - offset * 1000L diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index 0b41f92c6193..bcf4d78fb937 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -54,10 +54,10 @@ object TypeUtils { def getNumeric(t: DataType): Numeric[Any] = t.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]] - def getOrdering(t: DataType): Ordering[Any] = { + def getInterpretedOrdering(t: DataType): Ordering[Any] = { t match { case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]] - case s: StructType => s.ordering.asInstanceOf[Ordering[Any]] + case s: StructType => s.interpretedOrdering.asInstanceOf[Ordering[Any]] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index 5094058164b2..5770f59b5307 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -75,6 +75,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT override def simpleString: String = s"array<${elementType.simpleString}>" - private[spark] override def asNullable: ArrayType = + override private[spark] def asNullable: ArrayType = ArrayType(elementType.asNullable, containsNull = true) + + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { + f(this) || elementType.existsRecursively(f) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index f4428c2e8b20..7bcd623b3f33 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -77,6 +77,11 @@ abstract class DataType extends AbstractDataType { */ private[spark] def asNullable: DataType + /** + * Returns true if any `DataType` of this DataType tree satisfies the given function `f`. 
+ */ + private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = f(this) + override private[sql] def defaultConcreteType: DataType = this override private[sql] def acceptsType(other: DataType): Boolean = sameType(other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index c0155eeb450a..c988f1d1b972 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import java.math.{RoundingMode, MathContext} + import org.apache.spark.annotation.DeveloperApi /** @@ -28,7 +30,7 @@ import org.apache.spark.annotation.DeveloperApi * - Otherwise, the decimal value is longVal / (10 ** _scale) */ final class Decimal extends Ordered[Decimal] with Serializable { - import org.apache.spark.sql.types.Decimal.{BIG_DEC_ZERO, MAX_LONG_DIGITS, POW_10, ROUNDING_MODE} + import org.apache.spark.sql.types.Decimal._ private var decimalVal: BigDecimal = null private var longVal: Long = 0L @@ -261,29 +263,44 @@ final class Decimal extends Ordered[Decimal] with Serializable { def isZero: Boolean = if (decimalVal.ne(null)) decimalVal == BIG_DEC_ZERO else longVal == 0 - def + (that: Decimal): Decimal = Decimal(toBigDecimal + that.toBigDecimal) + def + (that: Decimal): Decimal = { + if (decimalVal.eq(null) && that.decimalVal.eq(null) && scale == that.scale) { + Decimal(longVal + that.longVal, Math.max(precision, that.precision), scale) + } else { + Decimal(toBigDecimal + that.toBigDecimal) + } + } - def - (that: Decimal): Decimal = Decimal(toBigDecimal - that.toBigDecimal) + def - (that: Decimal): Decimal = { + if (decimalVal.eq(null) && that.decimalVal.eq(null) && scale == that.scale) { + Decimal(longVal - that.longVal, Math.max(precision, that.precision), scale) + } else { + Decimal(toBigDecimal - that.toBigDecimal) + } + } - def * (that: Decimal): Decimal = Decimal(toBigDecimal * that.toBigDecimal) + // HiveTypeCoercion will take care of the precision, scale of result + def * (that: Decimal): Decimal = + Decimal(toJavaBigDecimal.multiply(that.toJavaBigDecimal, MATH_CONTEXT)) def / (that: Decimal): Decimal = - if (that.isZero) null else Decimal(toBigDecimal / that.toBigDecimal) + if (that.isZero) null else Decimal(toJavaBigDecimal.divide(that.toJavaBigDecimal, MATH_CONTEXT)) def % (that: Decimal): Decimal = - if (that.isZero) null else Decimal(toBigDecimal % that.toBigDecimal) + if (that.isZero) null + else Decimal(toJavaBigDecimal.remainder(that.toJavaBigDecimal, MATH_CONTEXT)) def remainder(that: Decimal): Decimal = this % that def unary_- : Decimal = { if (decimalVal.ne(null)) { - Decimal(-decimalVal) + Decimal(-decimalVal, precision, scale) } else { Decimal(-longVal, precision, scale) } } - def abs: Decimal = if (this.compare(Decimal(0)) < 0) this.unary_- else this + def abs: Decimal = if (this.compare(Decimal.ZERO) < 0) this.unary_- else this } object Decimal { @@ -296,6 +313,11 @@ object Decimal { private val BIG_DEC_ZERO = BigDecimal(0) + private val MATH_CONTEXT = new MathContext(DecimalType.MAX_PRECISION, RoundingMode.HALF_UP) + + private[sql] val ZERO = Decimal(0) + private[sql] val ONE = Decimal(1) + def apply(value: Double): Decimal = new Decimal().set(value) def apply(value: Long): Decimal = new Decimal().set(value) @@ -309,6 +331,9 @@ object Decimal { def apply(value: BigDecimal, precision: Int, scale: Int): Decimal = new 
Decimal().set(value, precision, scale) + def apply(value: java.math.BigDecimal, precision: Int, scale: Int): Decimal = + new Decimal().set(value, precision, scale) + def apply(unscaled: Long, precision: Int, scale: Int): Decimal = new Decimal().set(unscaled, precision, scale) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala index b314acdfe364..459fcb6fc0ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala @@ -17,22 +17,33 @@ package org.apache.spark.sql.types -import scala.reflect.ClassTag +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -import org.apache.spark.sql.catalyst.expressions.GenericSpecializedGetters - -class GenericArrayData(private[sql] val array: Array[Any]) - extends ArrayData with GenericSpecializedGetters { - - override def genericGet(ordinal: Int): Any = array(ordinal) +class GenericArrayData(private[sql] val array: Array[Any]) extends ArrayData { override def copy(): ArrayData = new GenericArrayData(array.clone()) - // todo: Array is invariant in scala, maybe use toSeq instead? - override def toArray[T: ClassTag](elementType: DataType): Array[T] = array.map(_.asInstanceOf[T]) - override def numElements(): Int = array.length + private def getAs[T](ordinal: Int) = array(ordinal).asInstanceOf[T] + override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null + override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal) + override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) + override def getByte(ordinal: Int): Byte = getAs(ordinal) + override def getShort(ordinal: Int): Short = getAs(ordinal) + override def getInt(ordinal: Int): Int = getAs(ordinal) + override def getLong(ordinal: Int): Long = getAs(ordinal) + override def getFloat(ordinal: Int): Float = getAs(ordinal) + override def getDouble(ordinal: Int): Double = getAs(ordinal) + override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal) + override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal) + override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal) + override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal) + override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal) + override def getArray(ordinal: Int): ArrayData = getAs(ordinal) + override def getMap(ordinal: Int): MapData = getAs(ordinal) + override def toString(): String = array.mkString("[", ",", "]") override def equals(o: Any): Boolean = { @@ -56,8 +67,8 @@ class GenericArrayData(private[sql] val array: Array[Any]) return false } if (!isNullAt(i)) { - val o1 = genericGet(i) - val o2 = other.genericGet(i) + val o1 = array(i) + val o2 = other.array(i) o1 match { case b1: Array[Byte] => if (!o2.isInstanceOf[Array[Byte]] || @@ -91,7 +102,7 @@ class GenericArrayData(private[sql] val array: Array[Any]) if (isNullAt(i)) { 0 } else { - genericGet(i) match { + array(i) match { case b: Boolean => if (b) 0 else 1 case b: Byte => b.toInt case s: Short => s.toInt diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index ac34b642827c..00461e529ca0 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -62,8 +62,12 @@ case class MapType( override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>" - private[spark] override def asNullable: MapType = + override private[spark] def asNullable: MapType = MapType(keyType.asNullable, valueType.asNullable, valueContainsNull = true) + + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { + f(this) || keyType.existsRecursively(f) || valueType.existsRecursively(f) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 6928707f7bf6..d8968ef80639 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute, RowOrdering} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering} /** @@ -292,7 +292,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru private[sql] def merge(that: StructType): StructType = StructType.merge(this, that).asInstanceOf[StructType] - private[spark] override def asNullable: StructType = { + override private[spark] def asNullable: StructType = { val newFields = fields.map { case StructField(name, dataType, nullable, metadata) => StructField(name, dataType.asNullable, nullable = true, metadata) @@ -301,7 +301,11 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru StructType(newFields) } - private[sql] val ordering = RowOrdering.forSchema(this.fields.map(_.dataType)) + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { + f(this) || fields.exists(field => field.dataType.existsRecursively(f)) + } + + private[sql] val interpretedOrdering = InterpretedOrdering.forSchema(this.fields.map(_.dataType)) } object StructType extends AbstractDataType { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 11e0c120f407..4025cbcec101 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -23,6 +23,8 @@ import java.math.MathContext import scala.util.Random +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -84,6 +86,7 @@ object RandomDataGenerator { * random data generator is defined for that data type. The generated values will use an external * representation of the data type; for example, the random generator for [[DateType]] will return * instances of [[java.sql.Date]] and the generator for [[StructType]] will return a [[Row]]. + * For a [[UserDefinedType]] for a class X, an instance of class X is returned. 
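
The existsRecursively overrides added above (ArrayType, MapType, StructType, plus the DataType default) give a single hook for asking whether any node of a type tree matches a predicate. A usage sketch; note the method is private[spark], so this only compiles from code inside an org.apache.spark package:

    import org.apache.spark.sql.types._

    val schema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("tags", ArrayType(MapType(StringType, LongType)))))

    schema.existsRecursively(_.isInstanceOf[MapType])     // true: found inside array<map<...>>
    schema.existsRecursively(_.isInstanceOf[BinaryType])  // false: no such node in this tree
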
* * @param dataType the type to generate values for * @param nullable whether null values should be generated @@ -106,7 +109,22 @@ object RandomDataGenerator { }) case BooleanType => Some(() => rand.nextBoolean()) case DateType => Some(() => new java.sql.Date(rand.nextInt())) - case TimestampType => Some(() => new java.sql.Timestamp(rand.nextLong())) + case TimestampType => + val generator = + () => { + var milliseconds = rand.nextLong() % 253402329599999L + // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT + // for "0001-01-01 00:00:00.000000". We need to find a + // number that is greater or equals to this number as a valid timestamp value. + while (milliseconds < -62135740800000L) { + // 253402329599999L is the the number of milliseconds since + // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999". + milliseconds = rand.nextLong() % 253402329599999L + } + // DateTimeUtils.toJavaTimestamp takes microsecond. + DateTimeUtils.toJavaTimestamp(milliseconds * 1000) + } + Some(generator) case CalendarIntervalType => Some(() => { val months = rand.nextInt(1000) val ns = rand.nextLong() @@ -159,6 +177,27 @@ object RandomDataGenerator { None } } + case udt: UserDefinedType[_] => { + val maybeSqlTypeGenerator = forType(udt.sqlType, nullable, seed) + // Because random data generator at here returns scala value, we need to + // convert it to catalyst value to call udt's deserialize. + val toCatalystType = CatalystTypeConverters.createToCatalystConverter(udt.sqlType) + + if (maybeSqlTypeGenerator.isDefined) { + val sqlTypeGenerator = maybeSqlTypeGenerator.get + val generator = () => { + val generatedScalaValue = sqlTypeGenerator.apply() + if (generatedScalaValue == null) { + null + } else { + udt.deserialize(toCatalystType(generatedScalaValue)) + } + } + Some(generator) + } else { + None + } + } case unsupportedType => None } // Handle nullability by wrapping the non-null value generator: diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala index df0f04563edc..03bb102c67fe 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala @@ -32,7 +32,9 @@ class CatalystTypeConvertersSuite extends SparkFunSuite { IntegerType, LongType, FloatType, - DoubleType) + DoubleType, + DecimalType.SYSTEM_DEFAULT, + DecimalType.USER_DEFAULT) test("null handling in rows") { val schema = StructType(simpleTypes.map(t => StructField(t.getClass.getName, t))) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala new file mode 100644 index 000000000000..5b802ccc637d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal} +import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning} + +class PartitioningSuite extends SparkFunSuite { + test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") { + val expressions = Seq(Literal(2), Literal(3)) + // Consider two HashPartitionings that have the same _set_ of hash expressions but which are + // created with different orderings of those expressions: + val partitioningA = HashPartitioning(expressions, 100) + val partitioningB = HashPartitioning(expressions.reverse, 100) + // These partitionings are not considered equal: + assert(partitioningA != partitioningB) + // However, they both satisfy the same clustered distribution: + val distribution = ClusteredDistribution(expressions) + assert(partitioningA.satisfies(distribution)) + assert(partitioningB.satisfies(distribution)) + // These partitionings compute different hashcodes for the same input row: + def computeHashCode(partitioning: HashPartitioning): Int = { + val hashExprProj = new InterpretedMutableProjection(partitioning.expressions, Seq.empty) + hashExprProj.apply(InternalRow.empty).hashCode() + } + assert(computeHashCode(partitioningA) != computeHashCode(partitioningB)) + // Thus, these partitionings are incompatible: + assert(!partitioningA.compatibleWith(partitioningB)) + assert(!partitioningB.compatibleWith(partitioningA)) + assert(!partitioningA.guarantees(partitioningB)) + assert(!partitioningB.guarantees(partitioningA)) + + // Just to be sure that we haven't cheated by having these methods always return false, + // check that identical partitionings are still compatible with and guarantee each other: + assert(partitioningA === partitioningA) + assert(partitioningA.guarantees(partitioningA)) + assert(partitioningA.compatibleWith(partitioningA)) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 26935c6e3b24..fbdd3a7776f5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -17,14 +17,10 @@ package org.apache.spark.sql.catalyst.analysis -import org.scalatest.BeforeAndAfter - -import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.Inner -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.types._ @@ -42,8 +38,8 @@ case class UnresolvedTestPlan() extends LeafNode { override def output: Seq[Attribute] = Nil } -class AnalysisErrorSuite extends 
SparkFunSuite with BeforeAndAfter { - import AnalysisSuite._ +class AnalysisErrorSuite extends AnalysisTest { + import TestRelations._ def errorTest( name: String, @@ -51,15 +47,7 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { errorMessages: Seq[String], caseSensitive: Boolean = true): Unit = { test(name) { - val error = intercept[AnalysisException] { - if (caseSensitive) { - caseSensitiveAnalyze(plan) - } else { - caseInsensitiveAnalyze(plan) - } - } - - errorMessages.foreach(m => assert(error.getMessage.toLowerCase.contains(m.toLowerCase))) + assertAnalysisError(plan, errorMessages, caseSensitive) } } @@ -69,21 +57,21 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { "single invalid type, single arg", testRelation.select(TestFunction(dateLit :: Nil, IntegerType :: Nil).as('a)), "cannot resolve" :: "testfunction" :: "argument 1" :: "requires int type" :: - "'null' is of date type" ::Nil) + "'null' is of date type" :: Nil) errorTest( "single invalid type, second arg", testRelation.select( TestFunction(dateLit :: dateLit :: Nil, DateType :: IntegerType :: Nil).as('a)), "cannot resolve" :: "testfunction" :: "argument 2" :: "requires int type" :: - "'null' is of date type" ::Nil) + "'null' is of date type" :: Nil) errorTest( "multiple invalid type", testRelation.select( TestFunction(dateLit :: dateLit :: Nil, IntegerType :: IntegerType :: Nil).as('a)), "cannot resolve" :: "testfunction" :: "argument 1" :: "argument 2" :: - "requires int type" :: "'null' is of date type" ::Nil) + "requires int type" :: "'null' is of date type" :: Nil) errorTest( "unresolved window function", @@ -157,6 +145,29 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { UnresolvedTestPlan(), "unresolved" :: Nil) + errorTest( + "union with unequal number of columns", + testRelation.unionAll(testRelation2), + "union" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + + errorTest( + "intersect with unequal number of columns", + testRelation.intersect(testRelation2), + "intersect" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + + errorTest( + "except with unequal number of columns", + testRelation.except(testRelation2), + "except" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + + errorTest( + "SPARK-9955: correct error message for aggregate", + // When parse SQL string, we will wrap aggregate expressions with UnresolvedAlias. 
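
All of the errorTest cases in this hunk reduce to the same check that assertAnalysisError performs (see the AnalysisTest change further down): every expected fragment must appear, case-insensitively, somewhere in the thrown message. A standalone sketch of that check; containsAll is a hypothetical helper and the message below is illustrative, not the analyzer's exact wording:

    def containsAll(message: String, expectedFragments: Seq[String]): Boolean =
      expectedFragments.map(_.toLowerCase).forall(message.toLowerCase.contains)

    containsAll(
      "Union can only be performed on tables with the same number of columns (1 vs 5)",
      Seq("union", "number of columns", "1", "5"))              // true
    containsAll("some unrelated analysis failure", Seq("union")) // false
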
+ testRelation2.where('bad_column > 1).groupBy('a)(UnresolvedAlias(max('b))), + "cannot resolve 'bad_column'" :: Nil) test("SPARK-6452 regression test") { // CheckAnalysis should throw AnalysisException when Aggregate contains missing attribute(s) @@ -169,11 +180,7 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { assert(plan.resolved) - val message = intercept[AnalysisException] { - caseSensitiveAnalyze(plan) - }.getMessage - - assert(message.contains("resolved attribute(s) a#1 missing from a#2")) + assertAnalysisError(plan, "resolved attribute(s) a#1 missing from a#2" :: Nil) } test("error test for self-join") { @@ -194,10 +201,8 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { AttributeReference("a", BinaryType)(exprId = ExprId(2)), AttributeReference("b", IntegerType)(exprId = ExprId(1)))) - val error = intercept[AnalysisException] { - caseSensitiveAnalyze(plan) - } - assert(error.message.contains("binary type expression a cannot be used in grouping expression")) + assertAnalysisError(plan, + "binary type expression a cannot be used in grouping expression" :: Nil) val plan2 = Aggregate( @@ -207,10 +212,8 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)), AttributeReference("b", IntegerType)(exprId = ExprId(1)))) - val error2 = intercept[AnalysisException] { - caseSensitiveAnalyze(plan2) - } - assert(error2.message.contains("map type expression a cannot be used in grouping expression")) + assertAnalysisError(plan2, + "map type expression a cannot be used in grouping expression" :: Nil) } test("Join can't work on binary and map types") { @@ -226,10 +229,7 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { Some(EqualTo(AttributeReference("a", BinaryType)(exprId = ExprId(2)), AttributeReference("c", BinaryType)(exprId = ExprId(4))))) - val error = intercept[AnalysisException] { - caseSensitiveAnalyze(plan) - } - assert(error.message.contains("binary type expression a cannot be used in join conditions")) + assertAnalysisError(plan, "binary type expression a cannot be used in join conditions" :: Nil) val plan2 = Join( @@ -243,9 +243,6 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter { Some(EqualTo(AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)), AttributeReference("c", MapType(IntegerType, StringType))(exprId = ExprId(4))))) - val error2 = intercept[AnalysisException] { - caseSensitiveAnalyze(plan2) - } - assert(error2.message.contains("map type expression a cannot be used in join conditions")) + assertAnalysisError(plan2, "map type expression a cannot be used in join conditions" :: Nil) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index a86cefe941e8..820b336aac75 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,68 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf -import 
org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ - -// todo: remove this and use AnalysisTest instead. -object AnalysisSuite { - val caseSensitiveConf = new SimpleCatalystConf(true) - val caseInsensitiveConf = new SimpleCatalystConf(false) - - val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf) - val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf) - - val caseSensitiveAnalyzer = - new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) { - override val extendedResolutionRules = EliminateSubQueries :: Nil - } - val caseInsensitiveAnalyzer = - new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseInsensitiveConf) { - override val extendedResolutionRules = EliminateSubQueries :: Nil - } - - def caseSensitiveAnalyze(plan: LogicalPlan): Unit = - caseSensitiveAnalyzer.checkAnalysis(caseSensitiveAnalyzer.execute(plan)) - - def caseInsensitiveAnalyze(plan: LogicalPlan): Unit = - caseInsensitiveAnalyzer.checkAnalysis(caseInsensitiveAnalyzer.execute(plan)) - - val testRelation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) - val testRelation2 = LocalRelation( - AttributeReference("a", StringType)(), - AttributeReference("b", StringType)(), - AttributeReference("c", DoubleType)(), - AttributeReference("d", DecimalType(10, 2))(), - AttributeReference("e", ShortType)()) - - val nestedRelation = LocalRelation( - AttributeReference("top", StructType( - StructField("duplicateField", StringType) :: - StructField("duplicateField", StringType) :: - StructField("differentCase", StringType) :: - StructField("differentcase", StringType) :: Nil - ))()) - - val nestedRelation2 = LocalRelation( - AttributeReference("top", StructType( - StructField("aField", StringType) :: - StructField("bField", StringType) :: - StructField("cField", StringType) :: Nil - ))()) - - val listRelation = LocalRelation( - AttributeReference("list", ArrayType(IntegerType))()) - - caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) - caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) -} - class AnalysisSuite extends AnalysisTest { + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test("union project *") { val plan = (1 to 100) @@ -149,7 +95,7 @@ class AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) - assert(pl(3).dataType == DecimalType(38, 29)) + assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } @@ -165,39 +111,28 @@ class AnalysisSuite extends AnalysisTest { test("pull out nondeterministic expressions from Sort") { val plan = Sort(Seq(SortOrder(Rand(33), Ascending)), false, testRelation) - val analyzed = caseSensitiveAnalyzer.execute(plan) - analyzed.transform { - case s: Sort if s.expressions.exists(!_.deterministic) => - fail("nondeterministic expressions are not allowed in Sort") - } + val projected = Alias(Rand(33), "_nondeterministic")() + val expected = + Project(testRelation.output, + Sort(Seq(SortOrder(projected.toAttribute, Ascending)), false, + Project(testRelation.output :+ projected, testRelation))) + checkAnalysis(plan, expected) } - test("remove still-need-evaluate ordering expressions from sort") { - val a = testRelation2.output(0) - val b = testRelation2.output(1) - - def makeOrder(e: Expression): SortOrder = SortOrder(e, Ascending) - - val noEvalOrdering = makeOrder(a) - val noEvalOrderingWithAlias = 
makeOrder(Alias(Alias(b, "name1")(), "name2")()) - - val needEvalExpr = Coalesce(Seq(a, Literal("1"))) - val needEvalExpr2 = Coalesce(Seq(a, b)) - val needEvalOrdering = makeOrder(needEvalExpr) - val needEvalOrdering2 = makeOrder(needEvalExpr2) - - val plan = Sort( - Seq(noEvalOrdering, noEvalOrderingWithAlias, needEvalOrdering, needEvalOrdering2), - false, testRelation2) - - val evaluatedOrdering = makeOrder(AttributeReference("_sortCondition", StringType)()) - val materializedExprs = Seq(needEvalExpr, needEvalExpr2).map(e => Alias(e, "_sortCondition")()) - - val expected = - Project(testRelation2.output, - Sort(Seq(makeOrder(a), makeOrder(b), evaluatedOrdering, evaluatedOrdering), false, - Project(testRelation2.output ++ materializedExprs, testRelation2))) + test("SPARK-9634: cleanup unnecessary Aliases in LogicalPlan") { + val a = testRelation.output.head + var plan = testRelation.select(((a + 1).as("a+1") + 2).as("col")) + var expected = testRelation.select((a + 1 + 2).as("col")) + checkAnalysis(plan, expected) + plan = testRelation.groupBy(a.as("a1").as("a2"))((min(a).as("min_a") + 1).as("col")) + expected = testRelation.groupBy(a)((min(a) + 1).as("col")) checkAnalysis(plan, expected) + + // CreateStruct is a special case that we should not trim Alias for it. + plan = testRelation.select(CreateStruct(Seq(a, (a + 1).as("a+1"))).as("col")) + checkAnalysis(plan, plan) + plan = testRelation.select(CreateStructUnsafe(Seq(a, (a + 1).as("a+1"))).as("col")) + checkAnalysis(plan, plan) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala index fdb4f28950da..53b3695a86be 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala @@ -17,40 +17,11 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.types._ trait AnalysisTest extends PlanTest { - val testRelation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) - - val testRelation2 = LocalRelation( - AttributeReference("a", StringType)(), - AttributeReference("b", StringType)(), - AttributeReference("c", DoubleType)(), - AttributeReference("d", DecimalType(10, 2))(), - AttributeReference("e", ShortType)()) - - val nestedRelation = LocalRelation( - AttributeReference("top", StructType( - StructField("duplicateField", StringType) :: - StructField("duplicateField", StringType) :: - StructField("differentCase", StringType) :: - StructField("differentcase", StringType) :: Nil - ))()) - - val nestedRelation2 = LocalRelation( - AttributeReference("top", StructType( - StructField("aField", StringType) :: - StructField("bField", StringType) :: - StructField("cField", StringType) :: Nil - ))()) - - val listRelation = LocalRelation( - AttributeReference("list", ArrayType(IntegerType))()) val (caseSensitiveAnalyzer, caseInsensitiveAnalyzer) = { val caseSensitiveConf = new SimpleCatalystConf(true) @@ -59,8 +30,8 @@ trait AnalysisTest extends PlanTest { val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf) val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf) - 
caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) - caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) + caseSensitiveCatalog.registerTable(Seq("TaBlE"), TestRelations.testRelation) + caseInsensitiveCatalog.registerTable(Seq("TaBlE"), TestRelations.testRelation) new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) { override val extendedResolutionRules = EliminateSubQueries :: Nil @@ -100,6 +71,8 @@ trait AnalysisTest extends PlanTest { val e = intercept[Exception] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } - expectedErrors.forall(e.getMessage.contains) + assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains), + s"Expected to throw Exception contains: ${expectedErrors.mkString(", ")}, " + + s"actually we get ${e.getMessage}") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index fc11627da6fd..b4ad618c23e3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -136,10 +136,10 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter { checkType(Multiply(i, u), DecimalType(38, 18)) checkType(Multiply(u, u), DecimalType(38, 36)) - checkType(Divide(u, d1), DecimalType(38, 21)) - checkType(Divide(u, d2), DecimalType(38, 24)) - checkType(Divide(u, i), DecimalType(38, 29)) - checkType(Divide(u, u), DecimalType(38, 38)) + checkType(Divide(u, d1), DecimalType(38, 18)) + checkType(Divide(u, d2), DecimalType(38, 19)) + checkType(Divide(u, i), DecimalType(38, 23)) + checkType(Divide(u, u), DecimalType(38, 18)) checkType(Remainder(d1, u), DecimalType(19, 18)) checkType(Remainder(d2, u), DecimalType(21, 18)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala index cbdf453f600a..6f33ab733b61 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala @@ -285,6 +285,17 @@ class HiveTypeCoercionSuite extends PlanTest { CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))), CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))) ) + ruleTest(HiveTypeCoercion.CaseWhenCoercion, + CaseWhen(Seq(Literal(true), Literal(1.2), Literal.create(1, DecimalType(7, 2)))), + CaseWhen(Seq( + Literal(true), Literal(1.2), Cast(Literal.create(1, DecimalType(7, 2)), DoubleType))) + ) + ruleTest(HiveTypeCoercion.CaseWhenCoercion, + CaseWhen(Seq(Literal(true), Literal(100L), Literal.create(1, DecimalType(7, 2)))), + CaseWhen(Seq( + Literal(true), Cast(Literal(100L), DecimalType(22, 2)), + Cast(Literal.create(1, DecimalType(7, 2)), DecimalType(22, 2)))) + ) } test("type coercion simplification for equal to") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TestRelations.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TestRelations.scala new file mode 100644 index 000000000000..05b870705e7e --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TestRelations.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.types._ + +object TestRelations { + val testRelation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) + + val testRelation2 = LocalRelation( + AttributeReference("a", StringType)(), + AttributeReference("b", StringType)(), + AttributeReference("c", DoubleType)(), + AttributeReference("d", DecimalType(10, 2))(), + AttributeReference("e", ShortType)()) + + val nestedRelation = LocalRelation( + AttributeReference("top", StructType( + StructField("duplicateField", StringType) :: + StructField("duplicateField", StringType) :: + StructField("differentCase", StringType) :: + StructField("differentcase", StringType) :: Nil + ))()) + + val nestedRelation2 = LocalRelation( + AttributeReference("top", StructType( + StructField("aField", StringType) :: + StructField("bField", StringType) :: + StructField("cField", StringType) :: Nil + ))()) + + val listRelation = LocalRelation( + AttributeReference("list", ArrayType(IntegerType))()) +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index 0bae8fe2fd8a..72285c6a2419 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -23,6 +23,8 @@ import org.apache.spark.sql.types._ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { + import IntegralLiteralTestUtils._ + /** * Runs through the testFunc for all numeric data types. 
* @@ -47,6 +49,13 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Add(Literal.create(null, left.dataType), right), null) checkEvaluation(Add(left, Literal.create(null, right.dataType)), null) } + checkEvaluation(Add(positiveShortLit, negativeShortLit), -1.toShort) + checkEvaluation(Add(positiveIntLit, negativeIntLit), -1) + checkEvaluation(Add(positiveLongLit, negativeLongLit), -1L) + + DataTypeTestUtils.numericAndInterval.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(Add, tpe, tpe) + } } test("- (UnaryMinus)") { @@ -60,6 +69,16 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(UnaryMinus(Literal(Int.MinValue)), Int.MinValue) checkEvaluation(UnaryMinus(Literal(Short.MinValue)), Short.MinValue) checkEvaluation(UnaryMinus(Literal(Byte.MinValue)), Byte.MinValue) + checkEvaluation(UnaryMinus(positiveShortLit), (- positiveShort).toShort) + checkEvaluation(UnaryMinus(negativeShortLit), (- negativeShort).toShort) + checkEvaluation(UnaryMinus(positiveIntLit), - positiveInt) + checkEvaluation(UnaryMinus(negativeIntLit), - negativeInt) + checkEvaluation(UnaryMinus(positiveLongLit), - positiveLong) + checkEvaluation(UnaryMinus(negativeLongLit), - negativeLong) + + DataTypeTestUtils.numericAndInterval.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(UnaryMinus, tpe) + } } test("- (Minus)") { @@ -70,6 +89,14 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Subtract(Literal.create(null, left.dataType), right), null) checkEvaluation(Subtract(left, Literal.create(null, right.dataType)), null) } + checkEvaluation(Subtract(positiveShortLit, negativeShortLit), + (positiveShort - negativeShort).toShort) + checkEvaluation(Subtract(positiveIntLit, negativeIntLit), positiveInt - negativeInt) + checkEvaluation(Subtract(positiveLongLit, negativeLongLit), positiveLong - negativeLong) + + DataTypeTestUtils.numericAndInterval.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(Subtract, tpe, tpe) + } } test("* (Multiply)") { @@ -80,6 +107,14 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Multiply(Literal.create(null, left.dataType), right), null) checkEvaluation(Multiply(left, Literal.create(null, right.dataType)), null) } + checkEvaluation(Multiply(positiveShortLit, negativeShortLit), + (positiveShort * negativeShort).toShort) + checkEvaluation(Multiply(positiveIntLit, negativeIntLit), positiveInt * negativeInt) + checkEvaluation(Multiply(positiveLongLit, negativeLongLit), positiveLong * negativeLong) + + DataTypeTestUtils.numericTypeWithoutDecimal.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(Multiply, tpe, tpe) + } } test("/ (Divide) basic") { @@ -92,6 +127,10 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Divide(left, Literal.create(null, right.dataType)), null) checkEvaluation(Divide(left, Literal(convert(0))), null) // divide by zero } + + DataTypeTestUtils.numericTypeWithoutDecimal.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(Divide, tpe, tpe) + } } test("/ (Divide) for integral type") { @@ -99,6 +138,9 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Divide(Literal(1.toShort), Literal(2.toShort)), 0.toShort) checkEvaluation(Divide(Literal(1), Literal(2)), 0) checkEvaluation(Divide(Literal(1.toLong), Literal(2.toLong)), 0.toLong) + 
checkEvaluation(Divide(positiveShortLit, negativeShortLit), 0.toShort) + checkEvaluation(Divide(positiveIntLit, negativeIntLit), 0) + checkEvaluation(Divide(positiveLongLit, negativeLongLit), 0L) } test("/ (Divide) for floating point") { @@ -116,6 +158,18 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Remainder(left, Literal.create(null, right.dataType)), null) checkEvaluation(Remainder(left, Literal(convert(0))), null) // mod by 0 } + checkEvaluation(Remainder(positiveShortLit, positiveShortLit), 0.toShort) + checkEvaluation(Remainder(negativeShortLit, negativeShortLit), 0.toShort) + checkEvaluation(Remainder(positiveIntLit, positiveIntLit), 0) + checkEvaluation(Remainder(negativeIntLit, negativeIntLit), 0) + checkEvaluation(Remainder(positiveLongLit, positiveLongLit), 0L) + checkEvaluation(Remainder(negativeLongLit, negativeLongLit), 0L) + + // TODO: the following lines would fail the test due to inconsistency result of interpret + // and codegen for remainder between giant values, seems like a numeric stability issue + // DataTypeTestUtils.numericTypeWithoutDecimal.foreach { tpe => + // checkConsistencyBetweenInterpretedAndCodegen(Remainder, tpe, tpe) + // } } test("Abs") { @@ -127,6 +181,16 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Abs(Literal(convert(-1))), convert(1)) checkEvaluation(Abs(Literal.create(null, dataType)), null) } + checkEvaluation(Abs(positiveShortLit), positiveShort) + checkEvaluation(Abs(negativeShortLit), (- negativeShort).toShort) + checkEvaluation(Abs(positiveIntLit), positiveInt) + checkEvaluation(Abs(negativeIntLit), - negativeInt) + checkEvaluation(Abs(positiveLongLit), positiveLong) + checkEvaluation(Abs(negativeLongLit), - negativeLong) + + DataTypeTestUtils.numericTypeWithoutDecimal.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(Abs, tpe) + } } test("MaxOf basic") { @@ -138,6 +202,13 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(MaxOf(Literal.create(null, small.dataType), large), convert(2)) checkEvaluation(MaxOf(large, Literal.create(null, small.dataType)), convert(2)) } + checkEvaluation(MaxOf(positiveShortLit, negativeShortLit), (positiveShort).toShort) + checkEvaluation(MaxOf(positiveIntLit, negativeIntLit), positiveInt) + checkEvaluation(MaxOf(positiveLongLit, negativeLongLit), positiveLong) + + DataTypeTestUtils.ordered.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(MaxOf, tpe, tpe) + } } test("MaxOf for atomic type") { @@ -156,6 +227,13 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(MinOf(Literal.create(null, small.dataType), large), convert(2)) checkEvaluation(MinOf(small, Literal.create(null, small.dataType)), convert(1)) } + checkEvaluation(MinOf(positiveShortLit, negativeShortLit), (negativeShort).toShort) + checkEvaluation(MinOf(positiveIntLit, negativeIntLit), negativeInt) + checkEvaluation(MinOf(positiveLongLit, negativeLongLit), negativeLong) + + DataTypeTestUtils.ordered.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(MinOf, tpe, tpe) + } } test("MinOf for atomic type") { @@ -174,9 +252,16 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Pmod(left, Literal.create(null, right.dataType)), null) checkEvaluation(Remainder(left, Literal(convert(0))), null) // mod by 0 } - checkEvaluation(Pmod(-7, 3), 2) - checkEvaluation(Pmod(7.2D, 4.1D), 
3.1000000000000005) - checkEvaluation(Pmod(Decimal(0.7), Decimal(0.2)), Decimal(0.1)) - checkEvaluation(Pmod(2L, Long.MaxValue), 2L) + checkEvaluation(Pmod(Literal(-7), Literal(3)), 2) + checkEvaluation(Pmod(Literal(7.2D), Literal(4.1D)), 3.1000000000000005) + checkEvaluation(Pmod(Literal(Decimal(0.7)), Literal(Decimal(0.2))), Decimal(0.1)) + checkEvaluation(Pmod(Literal(2L), Literal(Long.MaxValue)), 2L) + checkEvaluation(Pmod(positiveShort, negativeShort), positiveShort.toShort) + checkEvaluation(Pmod(positiveInt, negativeInt), positiveInt) + checkEvaluation(Pmod(positiveLong, negativeLong), positiveLong) + } + + DataTypeTestUtils.numericTypeWithoutDecimal.foreach { tpe => + checkConsistencyBetweenInterpretedAndCodegen(MinOf, tpe, tpe) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala index fa30fbe52847..3a310c0e9a7a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala @@ -23,6 +23,8 @@ import org.apache.spark.sql.types._ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { + import IntegralLiteralTestUtils._ + test("BitwiseNOT") { def check(input: Any, expected: Any): Unit = { val expr = BitwiseNot(Literal(input)) @@ -37,6 +39,16 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { check(123456789123L, ~123456789123L) checkEvaluation(BitwiseNot(Literal.create(null, IntegerType)), null) + checkEvaluation(BitwiseNot(positiveShortLit), (~positiveShort).toShort) + checkEvaluation(BitwiseNot(negativeShortLit), (~negativeShort).toShort) + checkEvaluation(BitwiseNot(positiveIntLit), ~positiveInt) + checkEvaluation(BitwiseNot(negativeIntLit), ~negativeInt) + checkEvaluation(BitwiseNot(positiveLongLit), ~positiveLong) + checkEvaluation(BitwiseNot(negativeLongLit), ~negativeLong) + + DataTypeTestUtils.integralType.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(BitwiseNot, dt) + } } test("BitwiseAnd") { @@ -56,6 +68,14 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(BitwiseAnd(nullLit, Literal(1)), null) checkEvaluation(BitwiseAnd(Literal(1), nullLit), null) checkEvaluation(BitwiseAnd(nullLit, nullLit), null) + checkEvaluation(BitwiseAnd(positiveShortLit, negativeShortLit), + (positiveShort & negativeShort).toShort) + checkEvaluation(BitwiseAnd(positiveIntLit, negativeIntLit), positiveInt & negativeInt) + checkEvaluation(BitwiseAnd(positiveLongLit, negativeLongLit), positiveLong & negativeLong) + + DataTypeTestUtils.integralType.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(BitwiseAnd, dt, dt) + } } test("BitwiseOr") { @@ -75,6 +95,14 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(BitwiseOr(nullLit, Literal(1)), null) checkEvaluation(BitwiseOr(Literal(1), nullLit), null) checkEvaluation(BitwiseOr(nullLit, nullLit), null) + checkEvaluation(BitwiseOr(positiveShortLit, negativeShortLit), + (positiveShort | negativeShort).toShort) + checkEvaluation(BitwiseOr(positiveIntLit, negativeIntLit), positiveInt | negativeInt) + checkEvaluation(BitwiseOr(positiveLongLit, negativeLongLit), positiveLong | negativeLong) + + DataTypeTestUtils.integralType.foreach { dt => + 
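The Pmod expectations rewritten in this hunk (Pmod(-7, 3) evaluating to 2, and the boundary-literal cases staying non-negative) follow a positive-modulus rule rather than Scala's plain `%`. A minimal stand-alone sketch that reproduces the expected values (an illustration, not necessarily Spark's exact code):

{{{
// Like %, but a negative remainder is folded back by the divisor.
def pmod(a: Int, n: Int): Int = {
  val r = a % n
  if (r < 0) (r + n) % n else r
}

assert(-7 % 3 == -1)            // plain remainder keeps the dividend's sign
assert(pmod(-7, 3) == 2)        // Pmod(Literal(-7), Literal(3)) == 2
assert(pmod(128, -129) == 128)  // Pmod(positiveShort, negativeShort) == positiveShort
}}}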
checkConsistencyBetweenInterpretedAndCodegen(BitwiseOr, dt, dt) + } } test("BitwiseXor") { @@ -94,5 +122,13 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(BitwiseXor(nullLit, Literal(1)), null) checkEvaluation(BitwiseXor(Literal(1), nullLit), null) checkEvaluation(BitwiseXor(nullLit, nullLit), null) + checkEvaluation(BitwiseXor(positiveShortLit, negativeShortLit), + (positiveShort ^ negativeShort).toShort) + checkEvaluation(BitwiseXor(positiveIntLit, negativeIntLit), positiveInt ^ negativeInt) + checkEvaluation(BitwiseXor(positiveLongLit, negativeLongLit), positiveLong ^ negativeLong) + + DataTypeTestUtils.integralType.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(BitwiseXor, dt, dt) + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index cc82f7c3f5a7..e323467af5f4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -54,7 +54,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { // GenerateOrdering agrees with RowOrdering. (DataTypeTestUtils.atomicTypes ++ Set(NullType)).foreach { dataType => test(s"GenerateOrdering with $dataType") { - val rowOrdering = RowOrdering.forSchema(Seq(dataType, dataType)) + val rowOrdering = InterpretedOrdering.forSchema(Seq(dataType, dataType)) val genOrdering = GenerateOrdering.generate( BoundReference(0, dataType, nullable = true).asc :: BoundReference(1, dataType, nullable = true).asc :: Nil) @@ -87,7 +87,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { val length = 5000 val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1))) val plan = GenerateMutableProjection.generate(expressions)() - val actual = plan(new GenericMutableRow(length)).toSeq + val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType)) val expected = Seq.fill(length)(true) if (!checkResult(actual, expected)) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala index 2c7e85c446ec..a3e81888dfd0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala @@ -65,4 +65,25 @@ class CollectionFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.create(null, ArrayType(StringType)), null) } + + test("Array contains") { + val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) + val a1 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) + val a2 = Literal.create(Seq(null), ArrayType(LongType)) + val a3 = Literal.create(null, ArrayType(StringType)) + + checkEvaluation(ArrayContains(a0, Literal(1)), true) + checkEvaluation(ArrayContains(a0, Literal(0)), false) + checkEvaluation(ArrayContains(a0, Literal.create(null, IntegerType)), null) + + checkEvaluation(ArrayContains(a1, Literal("")), true) + checkEvaluation(ArrayContains(a1, Literal("a")), null) + checkEvaluation(ArrayContains(a1, Literal.create(null, StringType)), null) + + 
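The positiveShort/negativeShort, positiveInt/negativeInt and positiveLong/negativeLong literals threaded through the arithmetic and bitwise checks above are defined later in this patch in IntegralLiteralTestUtils; each sits just outside the range of the next narrower type. The repeated `.toShort` casts in the expected values come from JVM promotion rules, as this small sketch shows:

{{{
val positiveShort: Short = (Byte.MaxValue + 1).toShort  // 128, does not fit in a Byte
val negativeShort: Short = (Byte.MinValue - 1).toShort  // -129

// Short operands are promoted to Int, so the result must be cast back down,
// exactly as the expected values do with (... & ...).toShort.
val anded = positiveShort & negativeShort
assert(anded == 0)                                   // an Int result: 128 & -129 == 0
assert((~positiveShort).toShort == negativeShort)    // BitwiseNot(positiveShortLit)
assert((~negativeShort).toShort == positiveShort)    // BitwiseNot(negativeShortLit)
}}}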
checkEvaluation(ArrayContains(a2, Literal(1L)), null) + checkEvaluation(ArrayContains(a2, Literal.create(null, LongType)), null) + + checkEvaluation(ArrayContains(a3, Literal("")), null) + checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala index d26bcdb2902a..0df673bb9fa0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala @@ -66,6 +66,10 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper testIf(_.toLong, TimestampType) testIf(_.toString, StringType) + + DataTypeTestUtils.propertyCheckSupported.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(If, BooleanType, dt, dt) + } } test("case when") { @@ -176,6 +180,10 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper Literal(Timestamp.valueOf("2015-07-01 08:00:00")), Literal(Timestamp.valueOf("2015-07-01 10:00:00")))), Timestamp.valueOf("2015-07-01 08:00:00"), InternalRow.empty) + + DataTypeTestUtils.ordered.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(Least, dt, 2) + } } test("function greatest") { @@ -218,6 +226,9 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper Literal(Timestamp.valueOf("2015-07-01 08:00:00")), Literal(Timestamp.valueOf("2015-07-01 10:00:00")))), Timestamp.valueOf("2015-07-01 10:00:00"), InternalRow.empty) - } + DataTypeTestUtils.ordered.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(Greatest, dt, 2) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index e6e8790e9092..610d39e8493c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -28,6 +28,8 @@ import org.apache.spark.unsafe.types.CalendarInterval class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + import IntegralLiteralTestUtils._ + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") val sdfDate = new SimpleDateFormat("yyyy-MM-dd") val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime) @@ -58,6 +60,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } checkEvaluation(DayOfYear(Literal.create(null, DateType)), null) + checkConsistencyBetweenInterpretedAndCodegen(DayOfYear, DateType) } test("Year") { @@ -77,6 +80,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } } + checkConsistencyBetweenInterpretedAndCodegen(Year, DateType) } test("Quarter") { @@ -96,6 +100,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } } + checkConsistencyBetweenInterpretedAndCodegen(Quarter, DateType) } test("Month") { @@ -115,6 +120,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } } + checkConsistencyBetweenInterpretedAndCodegen(Month, DateType) } test("Day / DayOfMonth") { @@ -133,6 +139,7 @@ class DateExpressionsSuite extends SparkFunSuite 
with ExpressionEvalHelper { c.get(Calendar.DAY_OF_MONTH)) } } + checkConsistencyBetweenInterpretedAndCodegen(DayOfMonth, DateType) } test("Seconds") { @@ -147,6 +154,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Second(Literal(new Timestamp(c.getTimeInMillis))), c.get(Calendar.SECOND)) } + checkConsistencyBetweenInterpretedAndCodegen(Second, TimestampType) } test("WeekOfYear") { @@ -155,6 +163,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(WeekOfYear(Cast(Literal(sdfDate.format(d)), DateType)), 15) checkEvaluation(WeekOfYear(Cast(Literal(ts), DateType)), 45) checkEvaluation(WeekOfYear(Cast(Literal("2011-05-06"), DateType)), 18) + checkConsistencyBetweenInterpretedAndCodegen(WeekOfYear, DateType) } test("DateFormat") { @@ -182,6 +191,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } } + checkConsistencyBetweenInterpretedAndCodegen(Hour, TimestampType) } test("Minute") { @@ -198,6 +208,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { c.get(Calendar.MINUTE)) } } + checkConsistencyBetweenInterpretedAndCodegen(Minute, TimestampType) } test("date_add") { @@ -212,6 +223,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { null) checkEvaluation(DateAdd(Literal.create(null, DateType), Literal.create(null, IntegerType)), null) + checkEvaluation( + DateAdd(Literal(Date.valueOf("2016-02-28")), positiveIntLit), 49627) + checkEvaluation( + DateAdd(Literal(Date.valueOf("2016-02-28")), negativeIntLit), -15910) + checkConsistencyBetweenInterpretedAndCodegen(DateAdd, DateType, IntegerType) } test("date_sub") { @@ -226,6 +242,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { null) checkEvaluation(DateSub(Literal.create(null, DateType), Literal.create(null, IntegerType)), null) + checkEvaluation( + DateSub(Literal(Date.valueOf("2016-02-28")), positiveIntLit), -15909) + checkEvaluation( + DateSub(Literal(Date.valueOf("2016-02-28")), negativeIntLit), 49628) + checkConsistencyBetweenInterpretedAndCodegen(DateSub, DateType, IntegerType) } test("time_add") { @@ -244,6 +265,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( TimeAdd(Literal.create(null, TimestampType), Literal.create(null, CalendarIntervalType)), null) + checkConsistencyBetweenInterpretedAndCodegen(TimeAdd, TimestampType, CalendarIntervalType) } test("time_sub") { @@ -267,6 +289,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( TimeSub(Literal.create(null, TimestampType), Literal.create(null, CalendarIntervalType)), null) + checkConsistencyBetweenInterpretedAndCodegen(TimeSub, TimestampType, CalendarIntervalType) } test("add_months") { @@ -282,6 +305,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { null) checkEvaluation( AddMonths(Literal(Date.valueOf("2015-01-30")), Literal(Int.MinValue)), -7293498) + checkEvaluation( + AddMonths(Literal(Date.valueOf("2016-02-28")), positiveIntLit), 1014213) + checkEvaluation( + AddMonths(Literal(Date.valueOf("2016-02-28")), negativeIntLit), -980528) + checkConsistencyBetweenInterpretedAndCodegen(AddMonths, DateType, IntegerType) } test("months_between") { @@ -306,6 +334,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(MonthsBetween(t, tnull), null) checkEvaluation(MonthsBetween(tnull, t), null) 
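The hard-coded expectations in the date_add and date_sub tests above (49627, -15910, -15909, 49628) are plain epoch-day arithmetic: DateType values are day counts since 1970-01-01, and positiveInt/negativeInt are 32768 and -32769. A quick derivation, using java.time only to look up the epoch day:

{{{
import java.time.LocalDate

val epochDay = LocalDate.of(2016, 2, 28).toEpochDay  // days since 1970-01-01
assert(epochDay == 16859L)
assert(epochDay + 32768L == 49627L)   // DateAdd(2016-02-28, positiveIntLit)
assert(epochDay - 32769L == -15910L)  // DateAdd(2016-02-28, negativeIntLit)
assert(epochDay - 32768L == -15909L)  // DateSub(2016-02-28, positiveIntLit)
assert(epochDay + 32769L == 49628L)   // DateSub(2016-02-28, negativeIntLit)
}}}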
checkEvaluation(MonthsBetween(tnull, tnull), null) + checkConsistencyBetweenInterpretedAndCodegen(MonthsBetween, TimestampType, TimestampType) } test("last_day") { @@ -323,6 +352,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(LastDay(Literal(Date.valueOf("2016-01-06"))), Date.valueOf("2016-01-31")) checkEvaluation(LastDay(Literal(Date.valueOf("2016-02-07"))), Date.valueOf("2016-02-29")) checkEvaluation(LastDay(Literal.create(null, DateType)), null) + checkConsistencyBetweenInterpretedAndCodegen(LastDay, DateType) } test("next_day") { @@ -356,6 +386,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ToDate(Literal(Date.valueOf("2015-07-22"))), DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-22"))) checkEvaluation(ToDate(Literal.create(null, DateType)), null) + checkConsistencyBetweenInterpretedAndCodegen(ToDate, DateType) } test("function trunc") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala new file mode 100644 index 000000000000..511f0307901d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.types.{LongType, DecimalType, Decimal} + + +class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("UnscaledValue") { + val d1 = Decimal("10.1") + checkEvaluation(UnscaledValue(Literal(d1)), 101L) + val d2 = Decimal(101, 3, 1) + checkEvaluation(UnscaledValue(Literal(d2)), 101L) + checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) + } + + test("MakeDecimal") { + checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) + checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) + } + + test("PromotePrecision") { + val d1 = Decimal("10.1") + checkEvaluation(PromotePrecision(Literal(d1)), d1) + val d2 = Decimal(101, 3, 1) + checkEvaluation(PromotePrecision(Literal(d2)), d2) + checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) + } + + test("CheckOverflow") { + val d1 = Decimal("10.1") + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) + + val d2 = Decimal(101, 3, 1) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) + + checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 3e5515129874..465f7d08aa14 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -17,22 +17,23 @@ package org.apache.spark.sql.catalyst.expressions +import org.scalacheck.Gen import org.scalactic.TripleEqualsSupport.Spread +import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} +import org.apache.spark.sql.types.DataType /** * A few helper functions for expression evaluation testing. Mixin this trait to use them. 
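The new DecimalExpressionSuite above pins down the unscaled-value/precision/scale bookkeeping. The same relationships can be checked with java.math.BigDecimal as a stand-in for Spark's Decimal, which also shows why CheckOverflow to DecimalType(4, 3) is expected to return null:

{{{
import java.math.{BigDecimal => JBigDecimal}
import java.math.RoundingMode

val d = new JBigDecimal("10.1")
assert(d.unscaledValue.longValue == 101L)  // UnscaledValue(Literal(Decimal("10.1"))) == 101L
assert(d.scale == 1)
assert(JBigDecimal.valueOf(101L, 1) == d)  // MakeDecimal(Literal(101L), 3, 1) reverses it

// CheckOverflow(d, DecimalType(4, 3)): at scale 3 the value becomes 10.100, which
// carries five significant digits and no longer fits precision 4, hence the null.
assert(d.setScale(3, RoundingMode.HALF_UP).precision == 5)
}}}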
*/ -trait ExpressionEvalHelper { +trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { self: SparkFunSuite => - protected val defaultOptimizer = new DefaultOptimizer - protected def create_row(values: Any*): InternalRow = { InternalRow.fromSeq(values.map(CatalystTypeConverters.convertToCatalyst)) } @@ -188,7 +189,7 @@ trait ExpressionEvalHelper { expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation) - val optimizedPlan = defaultOptimizer.execute(plan) + val optimizedPlan = DefaultOptimizer.execute(plan) checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow) } @@ -213,4 +214,111 @@ trait ExpressionEvalHelper { plan(inputRow)).get(0, expression.dataType) assert(checkResult(actual, expected)) } + + /** + * Test evaluation results between Interpreted mode and Codegen mode, making sure we have + * consistent result regardless of the evaluation method we use. + * + * This method test against unary expressions by feeding them arbitrary literals of `dataType`. + */ + def checkConsistencyBetweenInterpretedAndCodegen( + c: Expression => Expression, + dataType: DataType): Unit = { + forAll (LiteralGenerator.randomGen(dataType)) { (l: Literal) => + cmpInterpretWithCodegen(EmptyRow, c(l)) + } + } + + /** + * Test evaluation results between Interpreted mode and Codegen mode, making sure we have + * consistent result regardless of the evaluation method we use. + * + * This method test against binary expressions by feeding them arbitrary literals of `dataType1` + * and `dataType2`. + */ + def checkConsistencyBetweenInterpretedAndCodegen( + c: (Expression, Expression) => Expression, + dataType1: DataType, + dataType2: DataType): Unit = { + forAll ( + LiteralGenerator.randomGen(dataType1), + LiteralGenerator.randomGen(dataType2) + ) { (l1: Literal, l2: Literal) => + cmpInterpretWithCodegen(EmptyRow, c(l1, l2)) + } + } + + /** + * Test evaluation results between Interpreted mode and Codegen mode, making sure we have + * consistent result regardless of the evaluation method we use. + * + * This method test against ternary expressions by feeding them arbitrary literals of `dataType1`, + * `dataType2` and `dataType3`. + */ + def checkConsistencyBetweenInterpretedAndCodegen( + c: (Expression, Expression, Expression) => Expression, + dataType1: DataType, + dataType2: DataType, + dataType3: DataType): Unit = { + forAll ( + LiteralGenerator.randomGen(dataType1), + LiteralGenerator.randomGen(dataType2), + LiteralGenerator.randomGen(dataType3) + ) { (l1: Literal, l2: Literal, l3: Literal) => + cmpInterpretWithCodegen(EmptyRow, c(l1, l2, l3)) + } + } + + /** + * Test evaluation results between Interpreted mode and Codegen mode, making sure we have + * consistent result regardless of the evaluation method we use. + * + * This method test against expressions take Seq[Expression] as input by feeding them + * arbitrary length Seq of arbitrary literal of `dataType`. 
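The helpers added to ExpressionEvalHelper here wrap ScalaTest's GeneratorDrivenPropertyChecks: generate arbitrary literals, evaluate the expression both interpreted and via codegen, and require identical results. A stripped-down, self-contained illustration of the same pattern (the suite name and the two evaluation stand-ins are invented for the example):

{{{
import org.scalacheck.Gen
import org.scalatest.FunSuite
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class ConsistencyPatternSuite extends FunSuite with GeneratorDrivenPropertyChecks {
  // Stand-ins for "interpreted" and "codegen" evaluation of one expression.
  private def interpreted(a: Int, b: Int): Long = a.toLong + b.toLong
  private def codegen(a: Int, b: Int): Long = b.toLong + a.toLong

  test("both evaluation paths agree on arbitrary inputs") {
    forAll(Gen.chooseNum(Int.MinValue, Int.MaxValue),
           Gen.chooseNum(Int.MinValue, Int.MaxValue)) { (a: Int, b: Int) =>
      assert(interpreted(a, b) === codegen(a, b))
    }
  }
}
}}}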
+ */ + def checkConsistencyBetweenInterpretedAndCodegen( + c: Seq[Expression] => Expression, + dataType: DataType, + minNumElements: Int = 0): Unit = { + forAll (Gen.listOf(LiteralGenerator.randomGen(dataType))) { (literals: Seq[Literal]) => + whenever(literals.size >= minNumElements) { + cmpInterpretWithCodegen(EmptyRow, c(literals)) + } + } + } + + private def cmpInterpretWithCodegen(inputRow: InternalRow, expr: Expression): Unit = { + val interpret = try { + evaluate(expr, inputRow) + } catch { + case e: Exception => fail(s"Exception evaluating $expr", e) + } + + val plan = generateProject( + GenerateMutableProjection.generate(Alias(expr, s"Optimized($expr)")() :: Nil)(), + expr) + val codegen = plan(inputRow).get(0, expr.dataType) + + if (!compareResults(interpret, codegen)) { + fail(s"Incorrect evaluation: $expr, interpret: $interpret, codegen: $codegen") + } + } + + /** + * Check the equality between result of expression and expected value, it will handle + * Array[Byte] and Spread[Double]. + */ + private[this] def compareResults(result: Any, expected: Any): Boolean = { + (result, expected) match { + case (result: Array[Byte], expected: Array[Byte]) => + java.util.Arrays.equals(result, expected) + case (result: Double, expected: Spread[Double]) => + expected.isWithin(result) + case (result: Double, expected: Double) if result.isNaN && expected.isNaN => + true + case (result: Float, expected: Float) if result.isNaN && expected.isNaN => + true + case _ => result == expected + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntegralLiteralTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntegralLiteralTestUtils.scala new file mode 100644 index 000000000000..2e5a121f4ec5 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntegralLiteralTestUtils.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +/** + * Utilities to make sure we pass the proper numeric ranges + */ +object IntegralLiteralTestUtils { + + val positiveShort: Short = (Byte.MaxValue + 1).toShort + val negativeShort: Short = (Byte.MinValue - 1).toShort + + val positiveShortLit: Literal = Literal(positiveShort) + val negativeShortLit: Literal = Literal(negativeShort) + + val positiveInt: Int = Short.MaxValue + 1 + val negativeInt: Int = Short.MinValue - 1 + + val positiveIntLit: Literal = Literal(positiveInt) + val negativeIntLit: Literal = Literal(negativeInt) + + val positiveLong: Long = Int.MaxValue + 1L + val negativeLong: Long = Int.MinValue - 1L + + val positiveLongLit: Literal = Literal(positiveLong) + val negativeLongLit: Literal = Literal(negativeLong) +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala new file mode 100644 index 000000000000..4addbaf0cbce --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite + +class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + val json = + """ + |{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], + |"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"book":[{"author":"Nigel Rees", + |"title":"Sayings of the Century","category":"reference","price":8.95}, + |{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99, + |"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings", + |"category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}], + |"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}}, + |"email":"amy@only_for_json_udf_test.net","owner":"amy","zip code":"94025", + |"fb:testid":"1234"} + |""".stripMargin + + test("$.store.bicycle") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.bicycle")), + """{"price":19.95,"color":"red"}""") + } + + test("$.store.book") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book")), + """[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference", + |"price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction", + |"price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. 
Tolkien","title": + |"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"}, + |{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}] + |""".stripMargin.replace("\n", "")) + } + + test("$.store.book[0]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[0]")), + """{"author":"Nigel Rees","title":"Sayings of the Century", + |"category":"reference","price":8.95}""".stripMargin.replace("\n", "")) + } + + test("$.store.book[*]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[*]")), + """[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference", + |"price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction", + |"price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title": + |"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"}, + |{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}] + |""".stripMargin.replace("\n", "")) + } + + test("$") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$")), + json.replace("\n", "")) + } + + test("$.store.book[0].category") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[0].category")), + "reference") + } + + test("$.store.book[*].category") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[*].category")), + """["reference","fiction","fiction"]""") + } + + test("$.store.book[*].isbn") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[*].isbn")), + """["0-553-21311-3","0-395-19395-8"]""") + } + + test("$.store.book[*].reader") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[*].reader")), + """[{"age":25,"name":"bob"},{"age":26,"name":"jack"}]""") + } + + test("$.store.basket[0][1]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[0][1]")), + "2") + } + + test("$.store.basket[*]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[*]")), + """[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]]""") + } + + test("$.store.basket[*][0]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[*][0]")), + "[1,3,5]") + } + + test("$.store.basket[0][*]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[0][*]")), + """[1,2,{"b":"y","a":"x"}]""") + } + + test("$.store.basket[*][*]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[*][*]")), + """[1,2,{"b":"y","a":"x"},3,4,5,6]""") + } + + test("$.store.basket[0][2].b") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[0][2].b")), + "y") + } + + test("$.store.basket[0][*].b") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[0][*].b")), + """["y"]""") + } + + test("$.zip code") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.zip code")), + "94025") + } + + test("$.fb:testid") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.fb:testid")), + "1234") + } + + test("preserve newlines") { + checkEvaluation( + GetJsonObject(Literal("""{"a":"b\nc"}"""), Literal("$.a")), + "b\nc") + } + + test("escape") { + checkEvaluation( + GetJsonObject(Literal("""{"a":"b\"c"}"""), Literal("$.a")), + "b\"c") + } + + test("$.non_exist_key") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.non_exist_key")), + null) + } + + test("$..no_recursive") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$..no_recursive")), + null) + } + + 
test("$.store.book[10]") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[10]")), + null) + } + + test("$.store.book[0].non_exist_key") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.book[0].non_exist_key")), + null) + } + + test("$.store.basket[*].non_exist_key") { + checkEvaluation( + GetJsonObject(Literal(json), Literal("$.store.basket[*].non_exist_key")), + null) + } + + test("non foldable literal") { + checkEvaluation( + GetJsonObject(NonFoldableLiteral(json), NonFoldableLiteral("$.fb:testid")), + "1234") + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d21611e..015eb1897fb8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("decimal") { - List(0.0, 1.2, 1.1111, 5).foreach { d => + List(-0.0001, 0.0, 0.001, 1.2, 1.1111, 5).foreach { d => checkEvaluation(Literal(Decimal(d)), Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), - Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), + Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala new file mode 100644 index 000000000000..ee6d25157fc0 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import java.sql.{Date, Timestamp} + +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.Matchers +import org.scalatest.prop.GeneratorDrivenPropertyChecks + +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval + +/** + * Property is a high-level specification of behavior that should hold for a range of data points. 
+ * + * For example, while we are evaluating a deterministic expression for some input, we should always + * hold the property that the result never changes, regardless of how we get the result, + * via interpreted or codegen. + * + * In ScalaTest, properties are specified as functions and the data points used to check properties + * can be supplied by either tables or generators. + * + * Generator-driven property checks are performed via integration with ScalaCheck. + * + * @example {{{ + * def toTest(i: Int): Boolean = if (i % 2 == 0) true else false + * + * import org.scalacheck.Gen + * + * test ("true if param is even") { + * val evenInts = for (n <- Gen.choose(-1000, 1000)) yield 2 * n + * forAll(evenInts) { (i: Int) => + * assert (toTest(i) === true) + * } + * } + * }}} + * + */ +object LiteralGenerator { + + lazy val byteLiteralGen: Gen[Literal] = + for { b <- Arbitrary.arbByte.arbitrary } yield Literal.create(b, ByteType) + + lazy val shortLiteralGen: Gen[Literal] = + for { s <- Arbitrary.arbShort.arbitrary } yield Literal.create(s, ShortType) + + lazy val integerLiteralGen: Gen[Literal] = + for { i <- Arbitrary.arbInt.arbitrary } yield Literal.create(i, IntegerType) + + lazy val longLiteralGen: Gen[Literal] = + for { l <- Arbitrary.arbLong.arbitrary } yield Literal.create(l, LongType) + + lazy val floatLiteralGen: Gen[Literal] = + for { + f <- Gen.chooseNum(Float.MinValue / 2, Float.MaxValue / 2, + Float.NaN, Float.PositiveInfinity, Float.NegativeInfinity) + } yield Literal.create(f, FloatType) + + lazy val doubleLiteralGen: Gen[Literal] = + for { + f <- Gen.chooseNum(Double.MinValue / 2, Double.MaxValue / 2, + Double.NaN, Double.PositiveInfinity, Double.NegativeInfinity) + } yield Literal.create(f, DoubleType) + + // TODO: decimal type + + lazy val stringLiteralGen: Gen[Literal] = + for { s <- Arbitrary.arbString.arbitrary } yield Literal.create(s, StringType) + + lazy val binaryLiteralGen: Gen[Literal] = + for { ab <- Gen.listOf[Byte](Arbitrary.arbByte.arbitrary) } + yield Literal.create(ab.toArray, BinaryType) + + lazy val booleanLiteralGen: Gen[Literal] = + for { b <- Arbitrary.arbBool.arbitrary } yield Literal.create(b, BooleanType) + + lazy val dateLiteralGen: Gen[Literal] = + for { d <- Arbitrary.arbInt.arbitrary } yield Literal.create(new Date(d), DateType) + + lazy val timestampLiteralGen: Gen[Literal] = + for { t <- Arbitrary.arbLong.arbitrary } yield Literal.create(new Timestamp(t), TimestampType) + + lazy val calendarIntervalLiterGen: Gen[Literal] = + for { m <- Arbitrary.arbInt.arbitrary; s <- Arbitrary.arbLong.arbitrary} + yield Literal.create(new CalendarInterval(m, s), CalendarIntervalType) + + + // Sometimes, it would be quite expensive when unlimited value is used, + // for example, the `times` arguments for StringRepeat would hang the test 'forever' + // if it's tested against Int.MaxValue by ScalaCheck, therefore, use values from a limited + // range is more reasonable + lazy val limitedIntegerLiteralGen: Gen[Literal] = + for { i <- Gen.choose(-100, 100) } yield Literal.create(i, IntegerType) + + def randomGen(dt: DataType): Gen[Literal] = { + dt match { + case ByteType => byteLiteralGen + case ShortType => shortLiteralGen + case IntegerType => integerLiteralGen + case LongType => longLiteralGen + case DoubleType => doubleLiteralGen + case FloatType => floatLiteralGen + case DateType => dateLiteralGen + case TimestampType => timestampLiteralGen + case BooleanType => booleanLiteralGen + case StringType => stringLiteralGen + case BinaryType => binaryLiteralGen + 
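The Float/Double generators above lean on two Gen features: chooseNum accepts extra "special" values, so NaN and the infinities keep being exercised, and the bounds are halved (presumably so that adding two generated values cannot overflow to infinity). A quick sanity check of both points:

{{{
import org.scalacheck.Gen

val doubleGen: Gen[Double] = Gen.chooseNum(
  Double.MinValue / 2, Double.MaxValue / 2,
  Double.NaN, Double.PositiveInfinity, Double.NegativeInfinity)

// Every generated value is either one of the specials or within the halved range.
assert(doubleGen.sample.forall(d =>
  d.isNaN || d.isInfinity || math.abs(d) <= Double.MaxValue / 2))

// Halved bounds stay finite under addition; full-range bounds do not.
assert(Double.MaxValue / 2 + Double.MaxValue / 2 == Double.MaxValue)
assert((Double.MaxValue + Double.MaxValue).isInfinity)
}}}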
case CalendarIntervalType => calendarIntervalLiterGen + case dt => throw new IllegalArgumentException(s"not supported type $dt") + } + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala index 649a5b44dc03..90c59f240b54 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala @@ -23,12 +23,14 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} import org.apache.spark.sql.types._ - class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { + import IntegralLiteralTestUtils._ + /** * Used for testing leaf math expressions. * @@ -148,7 +150,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { expression: Expression, inputRow: InternalRow = EmptyRow): Unit = { val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation) - val optimizedPlan = defaultOptimizer.execute(plan) + val optimizedPlan = DefaultOptimizer.execute(plan) checkNaNWithoutCodegen(optimizedPlan.expressions.head, inputRow) } @@ -181,60 +183,74 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("sin") { testUnary(Sin, math.sin) + checkConsistencyBetweenInterpretedAndCodegen(Sin, DoubleType) } test("asin") { testUnary(Asin, math.asin, (-10 to 10).map(_ * 0.1)) testUnary(Asin, math.asin, (11 to 20).map(_ * 0.1), expectNaN = true) + checkConsistencyBetweenInterpretedAndCodegen(Asin, DoubleType) } test("sinh") { testUnary(Sinh, math.sinh) + checkConsistencyBetweenInterpretedAndCodegen(Sinh, DoubleType) } test("cos") { testUnary(Cos, math.cos) + checkConsistencyBetweenInterpretedAndCodegen(Cos, DoubleType) } test("acos") { testUnary(Acos, math.acos, (-10 to 10).map(_ * 0.1)) testUnary(Acos, math.acos, (11 to 20).map(_ * 0.1), expectNaN = true) + checkConsistencyBetweenInterpretedAndCodegen(Acos, DoubleType) } test("cosh") { testUnary(Cosh, math.cosh) + checkConsistencyBetweenInterpretedAndCodegen(Cosh, DoubleType) } test("tan") { testUnary(Tan, math.tan) + checkConsistencyBetweenInterpretedAndCodegen(Tan, DoubleType) } test("atan") { testUnary(Atan, math.atan) + checkConsistencyBetweenInterpretedAndCodegen(Atan, DoubleType) } test("tanh") { testUnary(Tanh, math.tanh) + checkConsistencyBetweenInterpretedAndCodegen(Tanh, DoubleType) } test("toDegrees") { testUnary(ToDegrees, math.toDegrees) + checkConsistencyBetweenInterpretedAndCodegen(Acos, DoubleType) } test("toRadians") { testUnary(ToRadians, math.toRadians) + checkConsistencyBetweenInterpretedAndCodegen(ToRadians, DoubleType) } test("cbrt") { testUnary(Cbrt, math.cbrt) + checkConsistencyBetweenInterpretedAndCodegen(Cbrt, DoubleType) } test("ceil") { testUnary(Ceil, math.ceil) + checkConsistencyBetweenInterpretedAndCodegen(Ceil, DoubleType) } test("floor") { testUnary(Floor, math.floor) + checkConsistencyBetweenInterpretedAndCodegen(Floor, DoubleType) } test("factorial") { @@ -244,37 +260,45 @@ class MathFunctionsSuite extends SparkFunSuite 
with ExpressionEvalHelper { checkEvaluation(Literal.create(null, IntegerType), null, create_row(null)) checkEvaluation(Factorial(Literal(20)), 2432902008176640000L, EmptyRow) checkEvaluation(Factorial(Literal(21)), null, EmptyRow) + checkConsistencyBetweenInterpretedAndCodegen(Factorial.apply _, IntegerType) } test("rint") { testUnary(Rint, math.rint) + checkConsistencyBetweenInterpretedAndCodegen(Rint, DoubleType) } test("exp") { testUnary(Exp, math.exp) + checkConsistencyBetweenInterpretedAndCodegen(Exp, DoubleType) } test("expm1") { testUnary(Expm1, math.expm1) + checkConsistencyBetweenInterpretedAndCodegen(Expm1, DoubleType) } test("signum") { testUnary[Double, Double](Signum, math.signum) + checkConsistencyBetweenInterpretedAndCodegen(Signum, DoubleType) } test("log") { testUnary(Log, math.log, (1 to 20).map(_ * 0.1)) testUnary(Log, math.log, (-5 to 0).map(_ * 0.1), expectNull = true) + checkConsistencyBetweenInterpretedAndCodegen(Log, DoubleType) } test("log10") { testUnary(Log10, math.log10, (1 to 20).map(_ * 0.1)) testUnary(Log10, math.log10, (-5 to 0).map(_ * 0.1), expectNull = true) + checkConsistencyBetweenInterpretedAndCodegen(Log10, DoubleType) } test("log1p") { testUnary(Log1p, math.log1p, (0 to 20).map(_ * 0.1)) testUnary(Log1p, math.log1p, (-10 to -1).map(_ * 1.0), expectNull = true) + checkConsistencyBetweenInterpretedAndCodegen(Log1p, DoubleType) } test("bin") { @@ -292,12 +316,18 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Bin(l3), java.lang.Long.toBinaryString(123), row) checkEvaluation(Bin(l4), java.lang.Long.toBinaryString(1234), row) checkEvaluation(Bin(l5), java.lang.Long.toBinaryString(-123), row) + + checkEvaluation(Bin(positiveLongLit), java.lang.Long.toBinaryString(positiveLong)) + checkEvaluation(Bin(negativeLongLit), java.lang.Long.toBinaryString(negativeLong)) + + checkConsistencyBetweenInterpretedAndCodegen(Bin, LongType) } test("log2") { def f: (Double) => Double = (x: Double) => math.log(x) / math.log(2) testUnary(Log2, f, (1 to 20).map(_ * 0.1)) testUnary(Log2, f, (-5 to 0).map(_ * 1.0), expectNull = true) + checkConsistencyBetweenInterpretedAndCodegen(Log2, DoubleType) } test("sqrt") { @@ -307,11 +337,13 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Sqrt(Literal.create(null, DoubleType)), null, create_row(null)) checkNaN(Sqrt(Literal(-1.0)), EmptyRow) checkNaN(Sqrt(Literal(-1.5)), EmptyRow) + checkConsistencyBetweenInterpretedAndCodegen(Sqrt, DoubleType) } test("pow") { testBinary(Pow, math.pow, (-5 to 5).map(v => (v * 1.0, v * 1.0))) testBinary(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), expectNaN = true) + checkConsistencyBetweenInterpretedAndCodegen(Pow, DoubleType, DoubleType) } test("shift left") { @@ -323,6 +355,18 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ShiftLeft(Literal(21.toLong), Literal(1)), 42.toLong) checkEvaluation(ShiftLeft(Literal(-21.toLong), Literal(1)), -42.toLong) + + checkEvaluation(ShiftLeft(positiveIntLit, positiveIntLit), positiveInt << positiveInt) + checkEvaluation(ShiftLeft(positiveIntLit, negativeIntLit), positiveInt << negativeInt) + checkEvaluation(ShiftLeft(negativeIntLit, positiveIntLit), negativeInt << positiveInt) + checkEvaluation(ShiftLeft(negativeIntLit, negativeIntLit), negativeInt << negativeInt) + checkEvaluation(ShiftLeft(positiveLongLit, positiveIntLit), positiveLong << positiveInt) + checkEvaluation(ShiftLeft(positiveLongLit, negativeIntLit), 
positiveLong << negativeInt) + checkEvaluation(ShiftLeft(negativeLongLit, positiveIntLit), negativeLong << positiveInt) + checkEvaluation(ShiftLeft(negativeLongLit, negativeIntLit), negativeLong << negativeInt) + + checkConsistencyBetweenInterpretedAndCodegen(ShiftLeft, IntegerType, IntegerType) + checkConsistencyBetweenInterpretedAndCodegen(ShiftLeft, LongType, IntegerType) } test("shift right") { @@ -334,6 +378,18 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ShiftRight(Literal(42.toLong), Literal(1)), 21.toLong) checkEvaluation(ShiftRight(Literal(-42.toLong), Literal(1)), -21.toLong) + + checkEvaluation(ShiftRight(positiveIntLit, positiveIntLit), positiveInt >> positiveInt) + checkEvaluation(ShiftRight(positiveIntLit, negativeIntLit), positiveInt >> negativeInt) + checkEvaluation(ShiftRight(negativeIntLit, positiveIntLit), negativeInt >> positiveInt) + checkEvaluation(ShiftRight(negativeIntLit, negativeIntLit), negativeInt >> negativeInt) + checkEvaluation(ShiftRight(positiveLongLit, positiveIntLit), positiveLong >> positiveInt) + checkEvaluation(ShiftRight(positiveLongLit, negativeIntLit), positiveLong >> negativeInt) + checkEvaluation(ShiftRight(negativeLongLit, positiveIntLit), negativeLong >> positiveInt) + checkEvaluation(ShiftRight(negativeLongLit, negativeIntLit), negativeLong >> negativeInt) + + checkConsistencyBetweenInterpretedAndCodegen(ShiftRight, IntegerType, IntegerType) + checkConsistencyBetweenInterpretedAndCodegen(ShiftRight, LongType, IntegerType) } test("shift right unsigned") { @@ -345,6 +401,26 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ShiftRightUnsigned(Literal(42.toLong), Literal(1)), 21.toLong) checkEvaluation(ShiftRightUnsigned(Literal(-42.toLong), Literal(1)), 9223372036854775787L) + + checkEvaluation(ShiftRightUnsigned(positiveIntLit, positiveIntLit), + positiveInt >>> positiveInt) + checkEvaluation(ShiftRightUnsigned(positiveIntLit, negativeIntLit), + positiveInt >>> negativeInt) + checkEvaluation(ShiftRightUnsigned(negativeIntLit, positiveIntLit), + negativeInt >>> positiveInt) + checkEvaluation(ShiftRightUnsigned(negativeIntLit, negativeIntLit), + negativeInt >>> negativeInt) + checkEvaluation(ShiftRightUnsigned(positiveLongLit, positiveIntLit), + positiveLong >>> positiveInt) + checkEvaluation(ShiftRightUnsigned(positiveLongLit, negativeIntLit), + positiveLong >>> negativeInt) + checkEvaluation(ShiftRightUnsigned(negativeLongLit, positiveIntLit), + negativeLong >>> positiveInt) + checkEvaluation(ShiftRightUnsigned(negativeLongLit, negativeIntLit), + negativeLong >>> negativeInt) + + checkConsistencyBetweenInterpretedAndCodegen(ShiftRightUnsigned, IntegerType, IntegerType) + checkConsistencyBetweenInterpretedAndCodegen(ShiftRightUnsigned, LongType, IntegerType) } test("hex") { @@ -359,6 +435,9 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { // Turn off scala style for non-ascii chars checkEvaluation(Hex(Literal("三重的".getBytes("UTF8"))), "E4B889E9878DE79A84") // scalastyle:on + Seq(LongType, BinaryType, StringType).foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(Hex.apply _, dt) + } } test("unhex") { @@ -372,16 +451,18 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { // Turn off scala style for non-ascii chars checkEvaluation(Unhex(Literal("E4B889E9878DE79A84")), "三重的".getBytes("UTF-8")) checkEvaluation(Unhex(Literal("三重的")), null) - // scalastyle:on + 
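The shift expectations above, including shifting by negativeInt, are well-defined because the JVM masks the shift distance to the low 5 bits for Int and the low 6 bits for Long, and >>> zero-fills, which is where the large positive results come from. In plain Scala:

{{{
val negativeInt = Short.MinValue - 1          // -32769

assert((negativeInt & 31) == 31)              // Int shifts use only the low 5 bits
assert((1 << negativeInt) == (1 << 31))       // i.e. Int.MinValue
assert((negativeInt & 63) == 63)              // Long shifts use only the low 6 bits
assert((1L << negativeInt) == (1L << 63))     // i.e. Long.MinValue
assert((-42 >>> 1) == 2147483627)             // ShiftRightUnsigned(-42, 1)
assert((-42L >>> 1) == 9223372036854775787L)  // the Long variant tested above
}}}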
checkConsistencyBetweenInterpretedAndCodegen(Unhex, StringType) } test("hypot") { testBinary(Hypot, math.hypot) + checkConsistencyBetweenInterpretedAndCodegen(Hypot, DoubleType, DoubleType) } test("atan2") { testBinary(Atan2, math.atan2) + checkConsistencyBetweenInterpretedAndCodegen(Atan2, DoubleType, DoubleType) } test("binary log") { @@ -413,6 +494,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { Logarithm(Literal(1.0), Literal(-1.0)), null, create_row(null)) + checkConsistencyBetweenInterpretedAndCodegen(Logarithm, DoubleType, DoubleType) } test("round") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala index b524d0af14a6..75d17417e5a0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala @@ -29,6 +29,7 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), "6ac1e56bc78f031059be7be854522c4c") checkEvaluation(Md5(Literal.create(null, BinaryType)), null) + checkConsistencyBetweenInterpretedAndCodegen(Md5, BinaryType) } test("sha1") { @@ -37,6 +38,7 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { "5d211bad8f4ee70e16c7d343a838fc344a1ed961") checkEvaluation(Sha1(Literal.create(null, BinaryType)), null) checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709") + checkConsistencyBetweenInterpretedAndCodegen(Sha1, BinaryType) } test("sha2") { @@ -55,6 +57,6 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), 2180413220L) checkEvaluation(Crc32(Literal.create(null, BinaryType)), null) + checkConsistencyBetweenInterpretedAndCodegen(Crc32, BinaryType) } - } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala index bf197124d8db..ace6c15dc841 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala @@ -77,7 +77,7 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } - test("AtLeastNNonNullNans") { + test("AtLeastNNonNulls") { val mix = Seq(Literal("x"), Literal.create(null, StringType), Literal.create(null, DoubleType), @@ -96,46 +96,11 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal(Float.MaxValue), Literal(false)) - checkEvaluation(AtLeastNNonNullNans(0, mix), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(2, mix), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(3, mix), false, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(0, nanOnly), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(3, nanOnly), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(4, nanOnly), false, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(0, nullOnly), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(3, nullOnly), true, EmptyRow) - checkEvaluation(AtLeastNNonNullNans(4, nullOnly), false, EmptyRow) - } - - test("AtLeastNNull") { - 
val mix = Seq(Literal("x"), - Literal.create(null, StringType), - Literal.create(null, DoubleType), - Literal(Double.NaN), - Literal(5f)) - - val nanOnly = Seq(Literal("x"), - Literal(10.0), - Literal(Float.NaN), - Literal(math.log(-2)), - Literal(Double.MaxValue)) - - val nullOnly = Seq(Literal("x"), - Literal.create(null, DoubleType), - Literal.create(null, DecimalType.USER_DEFAULT), - Literal(Float.MaxValue), - Literal(false)) - - checkEvaluation(AtLeastNNulls(0, mix), true, EmptyRow) - checkEvaluation(AtLeastNNulls(1, mix), true, EmptyRow) - checkEvaluation(AtLeastNNulls(2, mix), true, EmptyRow) - checkEvaluation(AtLeastNNulls(3, mix), false, EmptyRow) - checkEvaluation(AtLeastNNulls(0, nanOnly), true, EmptyRow) - checkEvaluation(AtLeastNNulls(1, nanOnly), false, EmptyRow) - checkEvaluation(AtLeastNNulls(2, nanOnly), false, EmptyRow) - checkEvaluation(AtLeastNNulls(0, nullOnly), true, EmptyRow) - checkEvaluation(AtLeastNNulls(1, nullOnly), true, EmptyRow) - checkEvaluation(AtLeastNNulls(2, nullOnly), true, EmptyRow) - checkEvaluation(AtLeastNNulls(3, nullOnly), false, EmptyRow) + checkEvaluation(AtLeastNNonNulls(2, mix), true, EmptyRow) + checkEvaluation(AtLeastNNonNulls(3, mix), false, EmptyRow) + checkEvaluation(AtLeastNNonNulls(3, nanOnly), true, EmptyRow) + checkEvaluation(AtLeastNNonNulls(4, nanOnly), false, EmptyRow) + checkEvaluation(AtLeastNNonNulls(3, nullOnly), true, EmptyRow) + checkEvaluation(AtLeastNNonNulls(4, nullOnly), false, EmptyRow) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index d7eb13c50b13..03e7611fce8f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.catalyst.expressions import scala.collection.immutable.HashSet import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.types.{Decimal, DoubleType, IntegerType, BooleanType} +import org.apache.spark.sql.RandomDataGenerator +import org.apache.spark.sql.types._ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -72,6 +72,16 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { notTrueTable.foreach { case (v, answer) => checkEvaluation(Not(Literal.create(v, BooleanType)), answer) } + checkConsistencyBetweenInterpretedAndCodegen(Not, BooleanType) + } + + test("AND, OR, EqualTo, EqualNullSafe consistency check") { + checkConsistencyBetweenInterpretedAndCodegen(And, BooleanType, BooleanType) + checkConsistencyBetweenInterpretedAndCodegen(Or, BooleanType, BooleanType) + DataTypeTestUtils.propertyCheckSupported.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(EqualTo, dt, dt) + checkConsistencyBetweenInterpretedAndCodegen(EqualNullSafe, dt, dt) + } } booleanLogicTest("AND", And, @@ -108,6 +118,12 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { (null, null, null) :: Nil) test("IN") { + checkEvaluation(In(Literal.create(null, IntegerType), Seq(Literal(1), Literal(2))), null) + checkEvaluation(In(Literal.create(null, IntegerType), Seq(Literal.create(null, IntegerType))), + null) + checkEvaluation(In(Literal(1), Seq(Literal.create(null, IntegerType))), null) + checkEvaluation(In(Literal(1), Seq(Literal(1), Literal.create(null, 
IntegerType))), true) + checkEvaluation(In(Literal(2), Seq(Literal(1), Literal.create(null, IntegerType))), null) checkEvaluation(In(Literal(1), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(2), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(3), Seq(Literal(1), Literal(2))), false) @@ -115,9 +131,38 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), Literal(2)))), true) - checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("^Ba*n"))), true) + val ns = Literal.create(null, StringType) + checkEvaluation(In(ns, Seq(Literal("1"), Literal("2"))), null) + checkEvaluation(In(ns, Seq(ns)), null) + checkEvaluation(In(Literal("a"), Seq(ns)), null) + checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("^Ba*n"), ns)), true) checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^Ba*n"))), true) checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^n"))), false) + + val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType, + LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType) + primitiveTypes.map { t => + val dataGen = RandomDataGenerator.forType(t, nullable = true).get + val inputData = Seq.fill(10) { + val value = dataGen.apply() + value match { + case d: Double if d.isNaN => 0.0d + case f: Float if f.isNaN => 0.0f + case _ => value + } + } + val input = inputData.map(Literal.create(_, t)) + val expected = if (inputData(0) == null) { + null + } else if (inputData.slice(1, 10).contains(inputData(0))) { + true + } else if (inputData.slice(1, 10).contains(null)) { + null + } else { + false + } + checkEvaluation(In(input(0), input.slice(1, 10)), expected) + } } test("INSET") { @@ -130,10 +175,35 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(InSet(one, hS), true) checkEvaluation(InSet(two, hS), true) checkEvaluation(InSet(two, nS), true) - checkEvaluation(InSet(nl, nS), true) checkEvaluation(InSet(three, hS), false) - checkEvaluation(InSet(three, nS), false) - checkEvaluation(And(InSet(one, hS), InSet(two, hS)), true) + checkEvaluation(InSet(three, nS), null) + checkEvaluation(InSet(nl, hS), null) + checkEvaluation(InSet(nl, nS), null) + + val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType, + LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType) + primitiveTypes.map { t => + val dataGen = RandomDataGenerator.forType(t, nullable = true).get + val inputData = Seq.fill(10) { + val value = dataGen.apply() + value match { + case d: Double if d.isNaN => 0.0d + case f: Float if f.isNaN => 0.0f + case _ => value + } + } + val input = inputData.map(Literal(_)) + val expected = if (inputData(0) == null) { + null + } else if (inputData.slice(1, 10).contains(inputData(0))) { + true + } else if (inputData.slice(1, 10).contains(null)) { + null + } else { + false + } + checkEvaluation(InSet(input(0), inputData.slice(1, 10).toSet), expected) + } } private val smallValues = Seq(1, Decimal(1), Array(1.toByte), "a", 0f, 0d, false).map(Literal(_)) @@ -145,6 +215,15 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { private val equalValues2 = Seq(1, Decimal(1), Array(1.toByte), "a", Float.NaN, Double.NaN, true).map(Literal(_)) + test("BinaryComparison consistency check") { + DataTypeTestUtils.ordered.foreach { dt => + checkConsistencyBetweenInterpretedAndCodegen(LessThan, dt, dt) + 
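The reworked IN and INSET expectations above, and the `expected` computation in the random-data loops, follow SQL's three-valued logic. A compact stand-in, with Option[Boolean] playing TRUE/FALSE/NULL:

{{{
def sqlIn(value: Any, list: Seq[Any]): Option[Boolean] = {
  if (value == null) None                    // NULL IN (...) is NULL
  else if (list.contains(value)) Some(true)  // a definite match wins
  else if (list.contains(null)) None         // no match, but a NULL keeps it unknown
  else Some(false)
}

assert(sqlIn(1, Seq(1, null)) == Some(true))  // In(Literal(1), Seq(Literal(1), null)) == true
assert(sqlIn(2, Seq(1, null)) == None)        // In(Literal(2), Seq(Literal(1), null)) == null
assert(sqlIn(3, Seq(1, 2)) == Some(false))
assert(sqlIn(null, Seq(1, 2)) == None)
}}}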
checkConsistencyBetweenInterpretedAndCodegen(LessThanOrEqual, dt, dt) + checkConsistencyBetweenInterpretedAndCodegen(GreaterThan, dt, dt) + checkConsistencyBetweenInterpretedAndCodegen(GreaterThanOrEqual, dt, dt) + } + } + test("BinaryComparison: lessThan") { for (i <- 0 until smallValues.length) { checkEvaluation(LessThan(smallValues(i), largeValues(i)), true) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 906be701beed..99e3b13ce8c9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -431,6 +431,20 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(SoundEx(Literal("!!")), "!!") } + test("translate") { + checkEvaluation( + StringTranslate(Literal("translate"), Literal("rnlt"), Literal("123")), "1a2s3ae") + checkEvaluation(StringTranslate(Literal("translate"), Literal(""), Literal("123")), "translate") + checkEvaluation(StringTranslate(Literal("translate"), Literal("rnlt"), Literal("")), "asae") + // test for multiple mapping + checkEvaluation(StringTranslate(Literal("abcd"), Literal("aba"), Literal("123")), "12cd") + checkEvaluation(StringTranslate(Literal("abcd"), Literal("aba"), Literal("12")), "12cd") + // scalastyle:off + // non ascii characters are not allowed in the source code, so we disable the scalastyle. + checkEvaluation(StringTranslate(Literal("花花世界"), Literal("花界"), Literal("ab")), "aa世b") + // scalastyle:on + } + test("TRIM/LTRIM/RTRIM") { val s = 'a.string.at(0) checkEvaluation(StringTrim(Literal(" aa ")), "aa", create_row(" abdef ")) @@ -659,7 +673,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes)) } - test("number format") { + test("format_number / FormatNumber") { checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Byte]), Literal(3)), "4.000") checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Short]), Literal(3)), "4.000") checkEvaluation(FormatNumber(Literal(4.0f), Literal(3)), "4.000") @@ -675,4 +689,14 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(FormatNumber(Literal.create(null, IntegerType), Literal(3)), null) checkEvaluation(FormatNumber(Literal.create(null, NullType), Literal(3)), null) } + + test("find in set") { + checkEvaluation( + FindInSet(Literal.create(null, StringType), Literal.create(null, StringType)), null) + checkEvaluation(FindInSet(Literal("ab"), Literal.create(null, StringType)), null) + checkEvaluation(FindInSet(Literal.create(null, StringType), Literal("abc,b,ab,c,def")), null) + checkEvaluation(FindInSet(Literal("ab"), Literal("abc,b,ab,c,def")), 3) + checkEvaluation(FindInSet(Literal("abf"), Literal("abc,b,ab,c,def")), 0) + checkEvaluation(FindInSet(Literal("ab,"), Literal("abc,b,ab,c,def")), 0) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index 59491c5ba160..8c7220319363 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -123,7 +123,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { DoubleType, StringType, BinaryType, - DecimalType.USER_DEFAULT + DecimalType.USER_DEFAULT, + DecimalType.SYSTEM_DEFAULT // ArrayType(IntegerType) ) val converter = UnsafeProjection.create(fieldTypes) @@ -151,6 +152,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(createdFromNull.getUTF8String(8) === null) assert(createdFromNull.getBinary(9) === null) assert(createdFromNull.getDecimal(10, 10, 0) === null) + assert(createdFromNull.getDecimal(11, 38, 18) === null) // assert(createdFromNull.get(11) === null) // If we have an UnsafeRow with columns that are initially non-null and we null out those @@ -169,6 +171,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { r.update(8, UTF8String.fromString("hello")) r.update(9, "world".getBytes) r.setDecimal(10, Decimal(10), 10) + r.setDecimal(11, Decimal(10.00, 38, 18), 38) // r.update(11, Array(11)) r } @@ -187,10 +190,17 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(setToNullAfterCreation.getBinary(9) === rowWithNoNullColumns.getBinary(9)) assert(setToNullAfterCreation.getDecimal(10, 10, 0) === rowWithNoNullColumns.getDecimal(10, 10, 0)) + assert(setToNullAfterCreation.getDecimal(11, 38, 18) === + rowWithNoNullColumns.getDecimal(11, 38, 18)) // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11)) for (i <- fieldTypes.indices) { - setToNullAfterCreation.setNullAt(i) + // Can't call setNullAt() on DecimalType + if (i == 11) { + setToNullAfterCreation.setDecimal(11, null, 38) + } else { + setToNullAfterCreation.setNullAt(i) + } } // There are some garbage left in the var-length area assert(Arrays.equals(createdFromNull.getBytes, setToNullAfterCreation.getBytes())) @@ -206,6 +216,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { // setToNullAfterCreation.update(8, UTF8String.fromString("hello")) // setToNullAfterCreation.update(9, "world".getBytes) setToNullAfterCreation.setDecimal(10, Decimal(10), 10) + setToNullAfterCreation.setDecimal(11, Decimal(10.00, 38, 18), 38) // setToNullAfterCreation.update(11, Array(11)) assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0)) @@ -220,6 +231,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { // assert(setToNullAfterCreation.get(9) === rowWithNoNullColumns.get(9)) assert(setToNullAfterCreation.getDecimal(10, 10, 0) === rowWithNoNullColumns.getDecimal(10, 10, 0)) + assert(setToNullAfterCreation.getDecimal(11, 38, 18) === + rowWithNoNullColumns.getDecimal(11, 38, 18)) // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala index aff1bee99faa..796d60032e1a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala @@ -22,7 +22,7 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.types._ -import 
org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.unsafe.Platform /** * A test suite for the bitset portion of the row concatenation. @@ -96,7 +96,7 @@ class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite { // This way we can test the joiner when the input UnsafeRows are not the entire arrays. val offset = numFields * 8 val buf = new Array[Byte](sizeInBytes + offset) - row.pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) + row.pointTo(buf, Platform.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) row } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala new file mode 100644 index 000000000000..098944a9f4fc --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions.codegen + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +/** + * A test suite for generated projections + */ +class GeneratedProjectionSuite extends SparkFunSuite { + + test("generated projections on wider table") { + val N = 1000 + val wideRow1 = new GenericInternalRow((1 to N).toArray[Any]) + val schema1 = StructType((1 to N).map(i => StructField("", IntegerType))) + val wideRow2 = new GenericInternalRow( + (1 to N).map(i => UTF8String.fromString(i.toString)).toArray[Any]) + val schema2 = StructType((1 to N).map(i => StructField("", StringType))) + val joined = new JoinedRow(wideRow1, wideRow2) + val joinedSchema = StructType(schema1 ++ schema2) + val nested = new JoinedRow(InternalRow(joined, joined), joined) + val nestedSchema = StructType( + Seq(StructField("", joinedSchema), StructField("", joinedSchema)) ++ joinedSchema) + + // test generated UnsafeProjection + val unsafeProj = UnsafeProjection.create(nestedSchema) + val unsafe: UnsafeRow = unsafeProj(nested) + (0 until N).foreach { i => + val s = UTF8String.fromString((i + 1).toString) + assert(i + 1 === unsafe.getInt(i + 2)) + assert(s === unsafe.getUTF8String(i + 2 + N)) + assert(i + 1 === unsafe.getStruct(0, N * 2).getInt(i)) + assert(s === unsafe.getStruct(0, N * 2).getUTF8String(i + N)) + assert(i + 1 === unsafe.getStruct(1, N * 2).getInt(i)) + assert(s === unsafe.getStruct(1, N * 2).getUTF8String(i + N)) + } + + // test generated SafeProjection + val safeProj = FromUnsafeProjection(nestedSchema) + val result = safeProj(unsafe) + // Can't compare GenericInternalRow with JoinedRow directly + (0 until N).foreach { i => + val r = i + 1 + val s = UTF8String.fromString((i + 1).toString) + assert(r === result.getInt(i + 2)) + assert(s === result.getUTF8String(i + 2 + N)) + assert(r === result.getStruct(0, N * 2).getInt(i)) + assert(s === result.getStruct(0, N * 2).getUTF8String(i + N)) + assert(r === result.getStruct(1, N * 2).getInt(i)) + assert(s === result.getStruct(1, N * 2).getUTF8String(i + N)) + } + + // test generated MutableProjection + val exprs = nestedSchema.fields.zipWithIndex.map { case (f, i) => + BoundReference(i, f.dataType, true) + } + val mutableProj = GenerateMutableProjection.generate(exprs)() + val row1 = mutableProj(result) + assert(result === row1) + val row2 = mutableProj(result) + assert(result === row2) + } + + test("generated unsafe projection with array of binary") { + val row = InternalRow( + Array[Byte](1, 2), + new GenericArrayData(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) + val fields = (BinaryType :: ArrayType(BinaryType) :: Nil).toArray[DataType] + + val unsafeProj = UnsafeProjection.create(fields) + val unsafeRow: UnsafeRow = unsafeProj(row) + assert(java.util.Arrays.equals(unsafeRow.getBinary(0), Array[Byte](1, 2))) + assert(java.util.Arrays.equals(unsafeRow.getArray(1).getBinary(0), Array[Byte](1, 2))) + assert(unsafeRow.getArray(1).isNullAt(1)) + assert(unsafeRow.getArray(1).getBinary(1) === null) + assert(java.util.Arrays.equals(unsafeRow.getArray(1).getBinary(2), Array[Byte](3, 4))) + + val safeProj = FromUnsafeProjection(fields) + val row2 = safeProj(unsafeRow) + assert(row2 === row) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index d4916ea8d273..1877cff1334b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.analysis.{AnalysisSuite, EliminateSubQueries} +import org.apache.spark.sql.catalyst.SimpleCatalystConf +import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.PlanTest @@ -88,20 +89,24 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { ('a === 'b || 'b > 3 && 'a > 3 && 'a < 5)) } - private def caseInsensitiveAnalyse(plan: LogicalPlan) = - AnalysisSuite.caseInsensitiveAnalyzer.execute(plan) + private val caseInsensitiveAnalyzer = + new Analyzer(EmptyCatalog, EmptyFunctionRegistry, new SimpleCatalystConf(false)) test("(a && b) || (a && c) => a && (b || c) when case insensitive") { - val plan = caseInsensitiveAnalyse(testRelation.where(('a > 2 && 'b > 3) || ('A > 2 && 'b < 5))) + val plan = caseInsensitiveAnalyzer.execute( + testRelation.where(('a > 2 && 'b > 3) || ('A > 2 && 'b < 5))) val actual = Optimize.execute(plan) - val expected = caseInsensitiveAnalyse(testRelation.where('a > 2 && ('b > 3 || 'b < 5))) + val expected = caseInsensitiveAnalyzer.execute( + testRelation.where('a > 2 && ('b > 3 || 'b < 5))) comparePlans(actual, expected) } test("(a || b) && (a || c) => a || (b && c) when case insensitive") { - val plan = caseInsensitiveAnalyse(testRelation.where(('a > 2 || 'b > 3) && ('A > 2 || 'b < 5))) + val plan = caseInsensitiveAnalyzer.execute( + testRelation.where(('a > 2 || 'b > 3) && ('A > 2 || 'b < 5))) val actual = Optimize.execute(plan) - val expected = caseInsensitiveAnalyse(testRelation.where('a > 2 || ('b > 3 && 'b < 5))) + val expected = caseInsensitiveAnalyzer.execute( + testRelation.where('a > 2 || ('b > 3 && 'b < 5))) comparePlans(actual, expected) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala new file mode 100644 index 000000000000..dbebcb86809d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.expressions.Explode +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{Project, LocalRelation, Generate, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.types.StringType + +class ColumnPruningSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("Column pruning", FixedPoint(100), + ColumnPruning) :: Nil + } + + test("Column pruning for Generate when Generate.join = false") { + val input = LocalRelation('a.int, 'b.array(StringType)) + + val query = Generate(Explode('b), false, false, None, 's.string :: Nil, input).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Generate(Explode('b), false, false, None, 's.string :: Nil, + Project('b.attr :: Nil, input)).analyze + + comparePlans(optimized, correctAnswer) + } + + test("Column pruning for Generate when Generate.join = true") { + val input = LocalRelation('a.int, 'b.int, 'c.array(StringType)) + + val query = + Project(Seq('a, 's), + Generate(Explode('c), true, false, None, 's.string :: Nil, + input)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Project(Seq('a, 's), + Generate(Explode('c), true, false, None, 's.string :: Nil, + Project(Seq('a, 'c), + input))).analyze + + comparePlans(optimized, correctAnswer) + } + + test("Turn Generate.join to false if possible") { + val input = LocalRelation('b.array(StringType)) + + val query = + Project(('s + 1).as("s+1") :: Nil, + Generate(Explode('b), true, false, None, 's.string :: Nil, + input)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Project(('s + 1).as("s+1") :: Nil, + Generate(Explode('b), false, false, None, 's.string :: Nil, + input)).analyze + + comparePlans(optimized, correctAnswer) + } + + // todo: add more tests for column pruning +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index ec3b2f1edfa0..e67606288f51 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -250,29 +250,14 @@ class ConstantFoldingSuite extends PlanTest { } test("Constant folding test: Fold In(v, list) into true or false") { - var originalQuery = + val originalQuery = testRelation .select('a) .where(In(Literal(1), Seq(Literal(1), Literal(2)))) - var optimized = Optimize.execute(originalQuery.analyze) - - var correctAnswer = - testRelation - .select('a) - .where(Literal(true)) - .analyze - - comparePlans(optimized, correctAnswer) - - originalQuery = - testRelation - .select('a) - .where(In(Literal(1), Seq(Literal(1), 'a.attr))) - - optimized = Optimize.execute(originalQuery.analyze) + val optimized = Optimize.execute(originalQuery.analyze) - correctAnswer = + val correctAnswer = testRelation .select('a) .where(Literal(true)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index 1d433275fed2..6f7b5b9572e2 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -43,16 +43,26 @@ class OptimizeInSuite extends PlanTest { val testRelation = LocalRelation('a.int, 'b.int, 'c.int) - test("OptimizedIn test: In clause optimized to InSet") { + test("OptimizedIn test: In clause not optimized to InSet when less than 10 items") { val originalQuery = testRelation .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2)))) .analyze + val optimized = Optimize.execute(originalQuery.analyze) + comparePlans(optimized, originalQuery) + } + + test("OptimizedIn test: In clause optimized to InSet when more than 10 items") { + val originalQuery = + testRelation + .where(In(UnresolvedAttribute("a"), (1 to 11).map(Literal(_)))) + .analyze + + val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = testRelation - .where(InSet(UnresolvedAttribute("a"), HashSet[Any]() + 1 + 2)) + .where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet)) .analyze comparePlans(optimized, correctAnswer) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala new file mode 100644 index 000000000000..455a3810c719 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.util._ + +/** + * This suite is used to test [[LogicalPlan]]'s `resolveOperators` and make sure it can correctly + * skip sub-trees that have already been marked as analyzed. 
+ */ +class LogicalPlanSuite extends SparkFunSuite { + private var invocationCount = 0 + private val function: PartialFunction[LogicalPlan, LogicalPlan] = { + case p: Project => + invocationCount += 1 + p + } + + private val testRelation = LocalRelation() + + test("resolveOperator runs on operators") { + invocationCount = 0 + val plan = Project(Nil, testRelation) + plan resolveOperators function + + assert(invocationCount === 1) + } + + test("resolveOperator runs on operators recursively") { + invocationCount = 0 + val plan = Project(Nil, Project(Nil, testRelation)) + plan resolveOperators function + + assert(invocationCount === 2) + } + + test("resolveOperator skips already resolved plans") { + invocationCount = 0 + val plan = Project(Nil, Project(Nil, testRelation)) + plan.foreach(_.setAnalyzed()) + plan resolveOperators function + + assert(invocationCount === 0) + } + + test("resolveOperator skips partially resolved plans") { + invocationCount = 0 + val plan1 = Project(Nil, testRelation) + val plan2 = Project(Nil, plan1) + plan1.foreach(_.setAnalyzed()) + plan2 resolveOperators function + + assert(invocationCount === 1) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index d18fa4df1335..6b9a11f0ff74 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -49,13 +49,17 @@ class DateTimeUtilsSuite extends SparkFunSuite { test("us and julian day") { val (d, ns) = toJulianDay(0) assert(d === JULIAN_DAY_OF_EPOCH) - assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND) + assert(ns === 0) assert(fromJulianDay(d, ns) == 0L) - val t = new Timestamp(61394778610000L) // (2015, 6, 11, 10, 10, 10, 100) - val (d1, ns1) = toJulianDay(fromJavaTimestamp(t)) - val t2 = toJavaTimestamp(fromJulianDay(d1, ns1)) - assert(t.equals(t2)) + Seq(Timestamp.valueOf("2015-06-11 10:10:10.100"), + Timestamp.valueOf("2015-06-11 20:10:10.100"), + Timestamp.valueOf("1900-06-11 20:10:10.100")).foreach { t => + val (d, ns) = toJulianDay(fromJavaTimestamp(t)) + assert(ns > 0) + val t1 = toJavaTimestamp(fromJulianDay(d, ns)) + assert(t.equals(t1)) + } } test("SPARK-6785: java date conversion before and after epoch") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 88b221cd81d7..706ecd29d135 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -170,6 +170,30 @@ class DataTypeSuite extends SparkFunSuite { } } + test("existsRecursively") { + val struct = StructType( + StructField("a", LongType) :: + StructField("b", FloatType) :: Nil) + assert(struct.existsRecursively(_.isInstanceOf[LongType])) + assert(struct.existsRecursively(_.isInstanceOf[StructType])) + assert(!struct.existsRecursively(_.isInstanceOf[IntegerType])) + + val mapType = MapType(struct, StringType) + assert(mapType.existsRecursively(_.isInstanceOf[LongType])) + assert(mapType.existsRecursively(_.isInstanceOf[StructType])) + assert(mapType.existsRecursively(_.isInstanceOf[StringType])) + assert(mapType.existsRecursively(_.isInstanceOf[MapType])) + assert(!mapType.existsRecursively(_.isInstanceOf[IntegerType])) + + val arrayType = 
ArrayType(mapType) + assert(arrayType.existsRecursively(_.isInstanceOf[LongType])) + assert(arrayType.existsRecursively(_.isInstanceOf[StructType])) + assert(arrayType.existsRecursively(_.isInstanceOf[StringType])) + assert(arrayType.existsRecursively(_.isInstanceOf[MapType])) + assert(arrayType.existsRecursively(_.isInstanceOf[ArrayType])) + assert(!arrayType.existsRecursively(_.isInstanceOf[IntegerType])) + } + def checkDataTypeJsonRepr(dataType: DataType): Unit = { test(s"JSON - $dataType") { assert(DataType.fromJson(dataType.json) === dataType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala index 417df006ab7c..ed2c641d63e2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala @@ -46,6 +46,25 @@ object DataTypeTestUtils { */ val numericTypes: Set[NumericType] = integralType ++ fractionalTypes + // TODO: remove this once we find out how to handle decimal properly in property check + val numericTypeWithoutDecimal: Set[DataType] = integralType ++ Set(DoubleType, FloatType) + + /** + * Instances of all [[NumericType]]s and [[CalendarIntervalType]] + */ + val numericAndInterval: Set[DataType] = numericTypeWithoutDecimal + CalendarIntervalType + + /** + * All the types that support ordering + */ + val ordered: Set[DataType] = + numericTypeWithoutDecimal + BooleanType + TimestampType + DateType + StringType + BinaryType + + /** + * All the types that we can use in a property check + */ + val propertyCheckSupported: Set[DataType] = ordered + /** * Instances of all [[AtomicType]]s. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala index 1d297beb3868..6921d15958a5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala @@ -166,6 +166,27 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester { assert(Decimal(100) % Decimal(0) === null) } + // regression test for SPARK-8359 + test("accurate precision after multiplication") { + val decimal = (Decimal(Long.MaxValue, 38, 0) * Decimal(Long.MaxValue, 38, 0)).toJavaBigDecimal + assert(decimal.unscaledValue.toString === "85070591730234615847396907784232501249") + } + + // regression test for SPARK-8677 + test("fix non-terminating decimal expansion problem") { + val decimal = Decimal(1.0, 10, 3) / Decimal(3.0, 10, 3) + // The difference between decimal should not be more than 0.001. 
+ assert(decimal.toDouble - 0.333 < 0.001) + } + + // regression test for SPARK-8800 + test("fix loss of precision/scale when doing division operation") { + val a = Decimal(2) / Decimal(3) + assert(a.toDouble < 1.0 && a.toDouble > 0.6) + val b = Decimal(1) / Decimal(8) + assert(b.toDouble === 0.125) + } + test("set/setOrNull") { assert(new Decimal().set(10L, 10, 0).toUnscaledLong === 10L) assert(new Decimal().set(100L, 10, 0).toUnscaledLong === 100L) diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 349007789f63..82f0dc328bc2 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index 9e2c9334a7be..09511ff35f78 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -19,6 +19,8 @@ import java.io.IOException; +import com.google.common.annotations.VisibleForTesting; + import org.apache.spark.SparkEnv; import org.apache.spark.shuffle.ShuffleMemoryManager; import org.apache.spark.sql.catalyst.InternalRow; @@ -27,7 +29,7 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryLocation; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -72,7 +74,7 @@ public final class UnsafeFixedWidthAggregationMap { */ public static boolean supportsAggregationBufferSchema(StructType schema) { for (StructField field: schema.fields()) { - if (!UnsafeRow.isFixedLength(field.dataType())) { + if (!UnsafeRow.isMutable(field.dataType())) { return false; } } @@ -111,8 +113,6 @@ public UnsafeFixedWidthAggregationMap( // Initialize the buffer for aggregation value final UnsafeProjection valueProjection = UnsafeProjection.create(aggregationBufferSchema); this.emptyAggregationBuffer = valueProjection.apply(emptyAggregationBuffer).getBytes(); - assert(this.emptyAggregationBuffer.length == aggregationBufferSchema.length() * 8 + - UnsafeRow.calculateBitSetWidthInBytes(aggregationBufferSchema.length())); } /** @@ -123,6 +123,10 @@ public UnsafeFixedWidthAggregationMap( public UnsafeRow getAggregationBuffer(InternalRow groupingKey) { final UnsafeRow unsafeGroupingKeyRow = this.groupingKeyProjection.apply(groupingKey); + return getAggregationBufferFromUnsafeRow(unsafeGroupingKeyRow); + } + + public UnsafeRow getAggregationBufferFromUnsafeRow(UnsafeRow unsafeGroupingKeyRow) { // Probe our map using the serialized key final BytesToBytesMap.Location loc = map.lookup( unsafeGroupingKeyRow.getBaseObject(), @@ -136,7 +140,7 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) { unsafeGroupingKeyRow.getBaseOffset(), unsafeGroupingKeyRow.getSizeInBytes(), emptyAggregationBuffer, - PlatformDependent.BYTE_ARRAY_OFFSET, + Platform.BYTE_ARRAY_OFFSET, emptyAggregationBuffer.length ); if (!putSucceeded) { @@ -156,14 +160,17 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) { } /** - * Returns an iterator over the keys and values in this map. 
+ * Returns an iterator over the keys and values in this map. This uses destructive iterator of + * BytesToBytesMap. So it is illegal to call any other method on this map after `iterator()` has + * been called. * * For efficiency, each call returns the same object. */ public KVIterator iterator() { return new KVIterator() { - private final BytesToBytesMap.BytesToBytesMapIterator mapLocationIterator = map.iterator(); + private final BytesToBytesMap.BytesToBytesMapIterator mapLocationIterator = + map.destructiveIterator(); private final UnsafeRow key = new UnsafeRow(); private final UnsafeRow value = new UnsafeRow(); @@ -208,6 +215,18 @@ public void close() { }; } + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsedBytes() { + return map.getPeakMemoryUsedBytes(); + } + + @VisibleForTesting + public int getNumDataPages() { + return map.getNumDataPages(); + } + /** * Free the memory associated with this map. This is idempotent and can be called multiple times. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java index f6b017686306..7db6b7ff50f2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java @@ -31,7 +31,7 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.storage.BlockManager; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -82,11 +82,17 @@ public UnsafeKVExternalSorter(StructType keySchema, StructType valueSchema, pageSizeBytes); } else { // Insert the records into the in-memory sorter. + // We will use the number of elements in the map as the initialSize of the + // UnsafeInMemorySorter. Because UnsafeInMemorySorter does not accept 0 as the initialSize, + // we will use 1 as its initial size if the map is empty. final UnsafeInMemorySorter inMemSorter = new UnsafeInMemorySorter( - taskMemoryManager, recordComparator, prefixComparator, map.numElements()); + taskMemoryManager, recordComparator, prefixComparator, Math.max(1, map.numElements())); - final int numKeyFields = keySchema.size(); + // We cannot use the destructive iterator here because we are reusing the existing memory + // pages in BytesToBytesMap to hold records during sorting. + // The only new memory we are allocating is the pointer/prefix array. BytesToBytesMap.BytesToBytesMapIterator iter = map.iterator(); + final int numKeyFields = keySchema.size(); UnsafeRow row = new UnsafeRow(); while (iter.hasNext()) { final BytesToBytesMap.Location loc = iter.next(); @@ -134,7 +140,11 @@ public void insertKV(UnsafeRow key, UnsafeRow value) throws IOException { value.getBaseObject(), value.getBaseOffset(), value.getSizeInBytes(), prefix); } - public KVIterator sortedIterator() throws IOException { + /** + * Returns a sorted iterator. It is the caller's responsibility to call `cleanupResources()` + * after consuming this iterator. 
+ */ + public KVSorterIterator sortedIterator() throws IOException { try { final UnsafeSorterIterator underlying = sorter.getSortedIterator(); if (!underlying.hasNext()) { @@ -142,64 +152,20 @@ public KVIterator sortedIterator() throws IOException { // here in order to prevent memory leaks. cleanupResources(); } - - return new KVIterator() { - private UnsafeRow key = new UnsafeRow(); - private UnsafeRow value = new UnsafeRow(); - private int numKeyFields = keySchema.size(); - private int numValueFields = valueSchema.size(); - - @Override - public boolean next() throws IOException { - try { - if (underlying.hasNext()) { - underlying.loadNext(); - - Object baseObj = underlying.getBaseObject(); - long recordOffset = underlying.getBaseOffset(); - int recordLen = underlying.getRecordLength(); - - // Note that recordLen = keyLen + valueLen + 4 bytes (for the keyLen itself) - int keyLen = PlatformDependent.UNSAFE.getInt(baseObj, recordOffset); - int valueLen = recordLen - keyLen - 4; - - key.pointTo(baseObj, recordOffset + 4, numKeyFields, keyLen); - value.pointTo(baseObj, recordOffset + 4 + keyLen, numValueFields, valueLen); - - return true; - } else { - key = null; - value = null; - cleanupResources(); - return false; - } - } catch (IOException e) { - cleanupResources(); - throw e; - } - } - - @Override - public UnsafeRow getKey() { - return key; - } - - @Override - public UnsafeRow getValue() { - return value; - } - - @Override - public void close() { - cleanupResources(); - } - }; + return new KVSorterIterator(underlying); } catch (IOException e) { cleanupResources(); throw e; } } + /** + * Return the peak memory used so far, in bytes. + */ + public long getPeakMemoryUsedBytes() { + return sorter.getPeakMemoryUsedBytes(); + } + /** * Marks the current page as no-more-space-available, and as a result, either allocate a * new page or spill when we see the next record. @@ -209,8 +175,11 @@ void closeCurrentPage() { sorter.closeCurrentPage(); } - private void cleanupResources() { - sorter.freeMemory(); + /** + * Frees this sorter's in-memory data structures and cleans up its spill files. 
+ */ + public void cleanupResources() { + sorter.cleanupResources(); } private static final class KVComparator extends RecordComparator { @@ -233,4 +202,60 @@ public int compare(Object baseObj1, long baseOff1, Object baseObj2, long baseOff return ordering.compare(row1, row2); } } + + public class KVSorterIterator extends KVIterator { + private UnsafeRow key = new UnsafeRow(); + private UnsafeRow value = new UnsafeRow(); + private final int numKeyFields = keySchema.size(); + private final int numValueFields = valueSchema.size(); + private final UnsafeSorterIterator underlying; + + private KVSorterIterator(UnsafeSorterIterator underlying) { + this.underlying = underlying; + } + + @Override + public boolean next() throws IOException { + try { + if (underlying.hasNext()) { + underlying.loadNext(); + + Object baseObj = underlying.getBaseObject(); + long recordOffset = underlying.getBaseOffset(); + int recordLen = underlying.getRecordLength(); + + // Note that recordLen = keyLen + valueLen + 4 bytes (for the keyLen itself) + int keyLen = Platform.getInt(baseObj, recordOffset); + int valueLen = recordLen - keyLen - 4; + key.pointTo(baseObj, recordOffset + 4, numKeyFields, keyLen); + value.pointTo(baseObj, recordOffset + 4 + keyLen, numValueFields, valueLen); + + return true; + } else { + key = null; + value = null; + cleanupResources(); + return false; + } + } catch (IOException e) { + cleanupResources(); + throw e; + } + } + + @Override + public UnsafeRow getKey() { + return key; + } + + @Override + public UnsafeRow getValue() { + return value; + } + + @Override + public void close() { + cleanupResources(); + } + }; } diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister new file mode 100644 index 000000000000..ca50000b4756 --- /dev/null +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -0,0 +1,3 @@ +org.apache.spark.sql.execution.datasources.jdbc.DefaultSource +org.apache.spark.sql.execution.datasources.json.DefaultSource +org.apache.spark.sql.execution.datasources.parquet.DefaultSource diff --git a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css new file mode 100644 index 000000000000..ddd3a91dd8ef --- /dev/null +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#plan-viz-graph .label { + font-weight: normal; + text-shadow: none; +} + +#plan-viz-graph svg g.node rect { + fill: #C3EBFF; + stroke: #3EC0FF; + stroke-width: 1px; +} + +/* Highlight the SparkPlan node name */ +#plan-viz-graph svg text :first-child { + font-weight: bold; +} + +#plan-viz-graph svg path { + stroke: #444; + stroke-width: 1.5px; +} diff --git a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js new file mode 100644 index 000000000000..5161fcde669e --- /dev/null +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var PlanVizConstants = { + svgMarginX: 16, + svgMarginY: 16 +}; + +function renderPlanViz() { + var svg = planVizContainer().append("svg"); + var metadata = d3.select("#plan-viz-metadata"); + var dot = metadata.select(".dot-file").text().trim(); + var graph = svg.append("g"); + + var g = graphlibDot.read(dot); + preprocessGraphLayout(g); + var renderer = new dagreD3.render(); + renderer(graph, g); + + // Round corners on rectangles + svg + .selectAll("rect") + .attr("rx", "5") + .attr("ry", "5"); + + var nodeSize = parseInt($("#plan-viz-metadata-size").text()); + for (var i = 0; i < nodeSize; i++) { + setupTooltipForSparkPlanNode(i); + } + + resizeSvg(svg) +} + +/* -------------------- * + * | Helper functions | * + * -------------------- */ + +function planVizContainer() { return d3.select("#plan-viz-graph"); } + +/* + * Set up the tooltip for a SparkPlan node using metadata. When the user moves the mouse over the + * node, it will display the details of this SparkPlan node on the right. + */ +function setupTooltipForSparkPlanNode(nodeId) { + var nodeTooltip = d3.select("#plan-meta-data-" + nodeId).text() + d3.select("svg g .node_" + nodeId) + .on('mouseover', function(d) { + var domNode = d3.select(this).node(); + $(domNode).tooltip({ + title: nodeTooltip, trigger: "manual", container: "body", placement: "right" + }); + $(domNode).tooltip("show"); + }) + .on('mouseout', function(d) { + var domNode = d3.select(this).node(); + $(domNode).tooltip("destroy"); + }) +} + +/* + * Helper function to pre-process the graph layout. + * This step is necessary for certain styles that affect the positioning + * and sizes of graph elements, e.g. padding, font style, shape. 
+ */ +function preprocessGraphLayout(g) { + var nodes = g.nodes(); + for (var i = 0; i < nodes.length; i++) { + var node = g.node(nodes[i]); + node.padding = "5"; + } + // Curve the edges + var edges = g.edges(); + for (var j = 0; j < edges.length; j++) { + var edge = g.edge(edges[j]); + edge.lineInterpolate = "basis"; + } +} + +/* + * Helper function to size the SVG appropriately such that all elements are displayed. + * This assumes that all outermost elements are clusters (rectangles). + */ +function resizeSvg(svg) { + var allClusters = svg.selectAll("g rect")[0]; + console.log(allClusters); + var startX = -PlanVizConstants.svgMarginX + + toFloat(d3.min(allClusters, function(e) { + console.log(e); + return getAbsolutePosition(d3.select(e)).x; + })); + var startY = -PlanVizConstants.svgMarginY + + toFloat(d3.min(allClusters, function(e) { + return getAbsolutePosition(d3.select(e)).y; + })); + var endX = PlanVizConstants.svgMarginX + + toFloat(d3.max(allClusters, function(e) { + var t = d3.select(e); + return getAbsolutePosition(t).x + toFloat(t.attr("width")); + })); + var endY = PlanVizConstants.svgMarginY + + toFloat(d3.max(allClusters, function(e) { + var t = d3.select(e); + return getAbsolutePosition(t).y + toFloat(t.attr("height")); + })); + var width = endX - startX; + var height = endY - startY; + svg.attr("viewBox", startX + " " + startY + " " + width + " " + height) + .attr("width", width) + .attr("height", height); +} + +/* Helper function to convert attributes to numeric values. */ +function toFloat(f) { + if (f) { + return parseFloat(f.toString().replace(/px$/, "")); + } else { + return f; + } +} + +/* + * Helper function to compute the absolute position of the specified element in our graph. + */ +function getAbsolutePosition(d3selection) { + if (d3selection.empty()) { + throw "Attempted to get absolute position of an empty selection."; + } + var obj = d3selection; + var _x = toFloat(obj.attr("x")) || 0; + var _y = toFloat(obj.attr("y")) || 0; + while (!obj.empty()) { + var transformText = obj.attr("transform"); + if (transformText) { + var translate = d3.transform(transformText).translate; + _x += toFloat(translate[0]); + _y += toFloat(translate[1]); + } + // Climb upwards to find how our parents are translated + obj = d3.select(obj.node().parentNode); + // Stop when we've reached the graph container itself + if (obj.node() == planVizContainer().node()) { + break; + } + } + return { x: _x, y: _y }; +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index b25dcbca82b9..807bc8c30c12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -54,7 +54,7 @@ class Column(protected[sql] val expr: Expression) extends Logging { def this(name: String) = this(name match { case "*" => UnresolvedStar(None) case _ if name.endsWith(".*") => UnresolvedStar(Some(name.substring(0, name.length - 2))) - case _ => UnresolvedAttribute(name) + case _ => UnresolvedAttribute.quotedString(name) }) /** Creates a column based on the given expression. 
*/ @@ -627,8 +627,19 @@ class Column(protected[sql] val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ + @deprecated("use isin", "1.5.0") @scala.annotation.varargs - def in(list: Any*): Column = In(expr, list.map(lit(_).expr)) + def in(list: Any*): Column = isin(list : _*) + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the evaluated values of the arguments. + * + * @group expr_ops + * @since 1.5.0 + */ + @scala.annotation.varargs + def isin(list: Any*): Column = In(expr, list.map(lit(_).expr)) /** * SQL like expression. @@ -742,10 +753,16 @@ class Column(protected[sql] val expr: Expression) extends Logging { * df.select($"colA".as("colB")) * }}} * + * If the current column has metadata associated with it, this metadata will be propagated + * to the new column. If this is not desired, use `as` with explicitly empty metadata. + * * @group expr_ops * @since 1.3.0 */ - def as(alias: String): Column = Alias(expr, alias)() + def as(alias: String): Column = expr match { + case ne: NamedExpression => Alias(expr, alias)(explicitMetadata = Some(ne.metadata)) + case other => Alias(other, alias)() + } /** * (Scala-specific) Assigns the given aliases to the results of a table generating function. @@ -778,10 +795,16 @@ class Column(protected[sql] val expr: Expression) extends Logging { * df.select($"colA".as('colB)) * }}} * + * If the current column has metadata associated with it, this metadata will be propagated + * to the new column. If this is not desired, use `as` with explicitly empty metadata. + * * @group expr_ops * @since 1.3.0 */ - def as(alias: Symbol): Column = Alias(expr, alias.name)() + def as(alias: Symbol): Column = expr match { + case ne: NamedExpression => Alias(expr, alias.name)(explicitMetadata = Some(ne.metadata)) + case other => Alias(other, alias.name)() + } /** * Gives the column an alias with metadata. 
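A minimal usage sketch of the `isin` operator and the metadata-propagating `as` alias introduced in the Column.scala hunk above. This is illustrative only: the `sqlContext` value, the `range` DataFrame, and the chosen literals are assumptions, not part of the patch.

  import org.apache.spark.sql.types.Metadata
  import sqlContext.implicits._   // assumes a SQLContext named `sqlContext` is in scope

  // Hypothetical DataFrame with a single "id" column, used only for illustration.
  val df = sqlContext.range(0, 10)

  // `isin` supersedes the deprecated `in`: keep rows whose value matches any of the literals.
  df.filter($"id".isin(1, 2, 3)).show()

  // `as` now propagates the source column's metadata to the alias; pass empty metadata
  // explicitly if that propagation is not desired.
  df.select($"id".as("renamed"), $"id".as("plain", Metadata.empty))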
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 3ea0f9ed3bdd..ae341c86a61e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -20,8 +20,6 @@ package org.apache.spark.sql import java.io.CharArrayWriter import java.util.Properties -import org.apache.spark.unsafe.types.UTF8String - import scala.language.implicitConversions import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -36,12 +34,12 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.{Filter, _} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser} -import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD} +import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, SQLExecution} import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation} -import org.apache.spark.sql.json.{JacksonGenerator, JSONRelation} +import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.sources.HadoopFsRelation import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel @@ -54,7 +52,6 @@ private[sql] object DataFrame { } } - /** * :: Experimental :: * A distributed collection of data organized into named columns. @@ -119,6 +116,9 @@ class DataFrame private[sql]( @transient val sqlContext: SQLContext, @DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) extends Serializable { + // Note for Spark contributors: if adding or updating any action in `DataFrame`, please make sure + // you wrap it with `withNewExecutionId` if this action doesn't call another action. + /** * A constructor that automatically analyzes the logical plan. * @@ -168,7 +168,7 @@ class DataFrame private[sql]( } /** - * Internal API for Python + * Compose the string representing rows for output * @param _numRows Number of rows to show * @param truncate Whether truncate long strings and align cells right */ @@ -634,6 +634,7 @@ class DataFrame private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. + * Note that the column name can also refer to a nested column like `a.b`. * @group dfops * @since 1.3.0 */ @@ -641,6 +642,7 @@ class DataFrame private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. + * Note that the column name can also refer to a nested column like `a.b`. * @group dfops * @since 1.3.0 */ @@ -682,14 +684,12 @@ class DataFrame private[sql]( // make it a NamedExpression. case Column(u: UnresolvedAttribute) => UnresolvedAlias(u) case Column(expr: NamedExpression) => expr - // Leave an unaliased explode with an empty list of names since the analzyer will generate the + // Leave an unaliased explode with an empty list of names since the analyzer will generate the // correct defaults after the nested expression's type has been resolved. 
case Column(explode: Explode) => MultiAlias(explode, Nil) case Column(expr: Expression) => Alias(expr, expr.prettyString)() } - // When user continuously call `select`, speed up analysis by collapsing `Project` - import org.apache.spark.sql.catalyst.optimizer.ProjectCollapsing - Project(namedExpressions.toSeq, ProjectCollapsing(logicalPlan)) + Project(namedExpressions.toSeq, logicalPlan) } /** @@ -1133,7 +1133,8 @@ class DataFrame private[sql]( ///////////////////////////////////////////////////////////////////////////// /** - * Returns a new [[DataFrame]] by adding a column. + * Returns a new [[DataFrame]] by adding a column or replacing the existing column that has + * the same name. * @group dfops * @since 1.3.0 */ @@ -1356,14 +1357,18 @@ class DataFrame private[sql]( * @group rdd * @since 1.3.0 */ - def foreach(f: Row => Unit): Unit = rdd.foreach(f) + def foreach(f: Row => Unit): Unit = withNewExecutionId { + rdd.foreach(f) + } /** * Applies a function f to each partition of this [[DataFrame]]. * @group rdd * @since 1.3.0 */ - def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f) + def foreachPartition(f: Iterator[Row] => Unit): Unit = withNewExecutionId { + rdd.foreachPartition(f) + } /** * Returns the first `n` rows in the [[DataFrame]]. @@ -1377,14 +1382,18 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - def collect(): Array[Row] = queryExecution.executedPlan.executeCollect() + def collect(): Array[Row] = withNewExecutionId { + queryExecution.executedPlan.executeCollect() + } /** * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]]. * @group action * @since 1.3.0 */ - def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() : _*) + def collectAsList(): java.util.List[Row] = withNewExecutionId { + java.util.Arrays.asList(rdd.collect() : _*) + } /** * Returns the number of rows in the [[DataFrame]]. @@ -1554,10 +1563,10 @@ class DataFrame private[sql]( */ def inputFiles: Array[String] = { val files: Seq[String] = logicalPlan.collect { - case LogicalRelation(fsBasedRelation: HadoopFsRelation) => - fsBasedRelation.paths.toSeq - case LogicalRelation(jsonRelation: JSONRelation) => - jsonRelation.path.toSeq + case LogicalRelation(fsBasedRelation: FileRelation) => + fsBasedRelation.inputFiles + case fr: FileRelation => + fr.inputFiles }.flatten files.toSet.toArray } @@ -1643,8 +1652,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. * - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by `write().saveAsTable(tableName)`. */ @@ -1662,8 +1675,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. 
* - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by `write().mode(mode).saveAsTable(tableName)`. */ @@ -1682,8 +1699,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. * - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by `write().format(source).saveAsTable(tableName)`. */ @@ -1702,8 +1723,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. * - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by `write().mode(mode).saveAsTable(tableName)`. */ @@ -1721,8 +1746,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. * - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by * `write().format(source).mode(mode).options(options).saveAsTable(tableName)`. @@ -1747,8 +1776,12 @@ class DataFrame private[sql]( * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. 
* - * Also note that while this function can persist the table metadata into Hive's metastore, - * the table will NOT be accessible from Hive, until SPARK-7550 is resolved. + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @group output * @deprecated As of 1.4.0, replaced by * `write().format(source).mode(mode).options(options).saveAsTable(tableName)`. @@ -1863,6 +1896,14 @@ class DataFrame private[sql]( write.mode(SaveMode.Append).insertInto(tableName) } + /** + * Wrap a DataFrame action to track all Spark jobs in the body so that we can connect them with + * an execution. + */ + private[sql] def withNewExecutionId[T](body: => T): T = { + SQLExecution.withNewExecutionId(sqlContext, queryExecution)(body) + } + //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// // End of deprecated methods diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index ea85f0657a72..a4fd4cf3b330 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -122,7 +122,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(minNonNulls: Int, cols: Seq[String]): DataFrame = { // Filtering condition: // only keep the row if it has at least `minNonNulls` non-null and non-NaN values. 
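The comment above restates the `drop(minNonNulls, cols)` contract: a row survives only if it has at least `minNonNulls` non-null and non-NaN values among the listed columns. A small sketch, assuming implicits are imported:

{{{
val df = Seq(
  ("alice", Some(30), Some(160.0)),
  ("bob",   None,     Some(175.0)),
  (null,    None,     None)).toDF("name", "age", "height")

// Keeps the first two rows; the last one has fewer than 2 non-null values.
df.na.drop(2, Seq("name", "age", "height")).show()
}}}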
- val predicate = AtLeastNNonNullNans(minNonNulls, cols.map(name => df.resolve(name))) + val predicate = AtLeastNNonNulls(minNonNulls, cols.map(name => df.resolve(name))) df.filter(Column(predicate)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index eb09807f9d9c..6dc7bfe33349 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -25,10 +25,10 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.RDD +import org.apache.spark.sql.execution.datasources.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} +import org.apache.spark.sql.execution.datasources.json.JSONRelation +import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.datasources.{LogicalRelation, ResolvedDataSource} -import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} -import org.apache.spark.sql.json.JSONRelation -import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.sql.types.StructType import org.apache.spark.{Logging, Partition} @@ -197,7 +197,13 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { table: String, parts: Array[Partition], connectionProperties: Properties): DataFrame = { - val relation = JDBCRelation(url, table, parts, connectionProperties)(sqlContext) + val props = new Properties() + extraOptions.foreach { case (key, value) => + props.put(key, value) + } + // connectionProperties should override settings in extraOptions + props.putAll(connectionProperties) + val relation = JDBCRelation(url, table, parts, props)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } @@ -237,7 +243,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { def json(jsonRDD: RDD[String]): DataFrame = { val samplingRatio = extraOptions.getOrElse("samplingRatio", "1.0").toDouble sqlContext.baseRelationToDataFrame( - new JSONRelation(() => jsonRDD, None, samplingRatio, userSpecifiedSchema)(sqlContext)) + new JSONRelation(Some(jsonRDD), samplingRatio, userSpecifiedSchema, None, None)(sqlContext)) } /** @@ -260,7 +266,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { sqlContext.baseRelationToDataFrame( new ParquetRelation( - globbedPaths.map(_.toString), None, None, extraOptions.toMap)(sqlContext)) + globbedPaths.map(_.toString), userSpecifiedSchema, None, extraOptions.toMap)(sqlContext)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 2e68e358f2f1..69c984717526 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the second column * @return the covariance of the two columns. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.cov("rand1", "rand2") + * res1: Double = 0.065... 
+ * }}} + * * @since 1.4.0 */ def cov(col1: String, col2: String): Double = { @@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2") + * res1: Double = 0.613... + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String, method: String): Double = { @@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2", "pearson") + * res1: Double = 0.613... + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String): Double = { @@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * of the DataFrame. * @return A DataFrame containing for the contingency table. * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * val ct = df.stat.crosstab("key", "value") + * ct.show() + * +---------+---+---+---+ + * |key_value| 1| 2| 3| + * +---------+---+---+---+ + * | 2| 2| 0| 1| + * | 1| 1| 1| 0| + * | 3| 0| 1| 1| + * +---------+---+---+---+ + * }}} + * * @since 1.4.0 */ def crosstab(col1: String, col2: String): DataFrame = { @@ -112,6 +147,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * than 1e-4. * @return A Local DataFrame with the Array of frequent items for each column. * + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = sqlContext.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... | + * +----------+ + * }}} + * * @since 1.4.0 */ def freqItems(cols: Array[String], support: Double): DataFrame = { @@ -147,6 +208,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. 
* + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = sqlContext.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... | + * +----------+ + * }}} + * * @since 1.4.0 */ def freqItems(cols: Seq[String], support: Double): DataFrame = { @@ -180,6 +267,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @tparam T stratum type * @return a new [[DataFrame]] that represents the stratified sample * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * val fractions = Map(1 -> 1.0, 3 -> 0.5) + * df.stat.sampleBy("key", fractions, 36L).show() + * +---+-----+ + * |key|value| + * +---+-----+ + * | 1| 1| + * | 1| 2| + * | 3| 2| + * +---+-----+ + * }}} + * * @since 1.5.0 */ def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 7e3318cefe62..ce8744b53175 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -23,8 +23,9 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable +import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, ResolvedDataSource} -import org.apache.spark.sql.jdbc.{JDBCWriteDetails, JdbcUtils} +import org.apache.spark.sql.sources.HadoopFsRelation /** @@ -185,6 +186,12 @@ final class DataFrameWriter private[sql](df: DataFrame) { * When `mode` is `Append`, the schema of the [[DataFrame]] need to be * the same as that of the existing table, and format or options will be ignored. * + * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * * @since 1.4.0 */ def saveAsTable(tableName: String): Unit = { @@ -211,7 +218,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { case _ => val cmd = CreateTableUsingAsSelect( - tableIdent.unquotedString, + tableIdent, source, temporary = false, partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]), @@ -237,7 +244,13 @@ final class DataFrameWriter private[sql](df: DataFrame) { * should be included. 
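The hunk opened above changes the `jdbc` writer (the reader got the same treatment earlier in this patch): options set through `option(...)` are folded into the JDBC connection properties, and any key that also appears in the explicitly passed `connectionProperties` keeps the explicit value. A hedged sketch of the write side, where the URL, table, credentials and the extra option key are all placeholders and `people` is assumed to be an existing DataFrame:

{{{
import java.util.Properties

val props = new Properties()
props.setProperty("user", "spark")
props.setProperty("password", "secret")

people.write
  .option("driver", "org.postgresql.Driver")   // illustrative extra option
  .jdbc("jdbc:postgresql://db:5432/app", "people_copy", props)
}}}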
*/ def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { - val conn = JdbcUtils.createConnection(url, connectionProperties) + val props = new Properties() + extraOptions.foreach { case (key, value) => + props.put(key, value) + } + // connectionProperties should override settings in extraOptions + props.putAll(connectionProperties) + val conn = JdbcUtils.createConnection(url, props) try { var tableExists = JdbcUtils.tableExists(conn, table) @@ -257,7 +270,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { // Create the table if the table didn't exist. if (!tableExists) { - val schema = JDBCWriteDetails.schemaString(df, url) + val schema = JdbcUtils.schemaString(df, url) val sql = s"CREATE TABLE $table ($schema)" conn.prepareStatement(sql).executeUpdate() } @@ -265,7 +278,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { conn.close() } - JDBCWriteDetails.saveTable(df, url, table, connectionProperties) + JdbcUtils.saveTable(df, url, table, props) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 387960c4b482..2974055a141b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -200,7 +200,7 @@ private[spark] object SQLConf { val IN_MEMORY_PARTITION_PRUNING = booleanConf("spark.sql.inMemoryColumnarStorage.partitionPruning", - defaultValue = Some(false), + defaultValue = Some(true), doc = "When true, enable partition pruning for in-memory columnar tables.", isPublic = false) @@ -223,14 +223,21 @@ private[spark] object SQLConf { defaultValue = Some(200), doc = "The default number of partitions to use when shuffling data for joins or aggregations.") - val CODEGEN_ENABLED = booleanConf("spark.sql.codegen", + val TUNGSTEN_ENABLED = booleanConf("spark.sql.tungsten.enabled", defaultValue = Some(true), + doc = "When true, use the optimized Tungsten physical execution backend which explicitly " + + "manages memory and dynamically generates bytecode for expression evaluation.") + + val CODEGEN_ENABLED = booleanConf("spark.sql.codegen", + defaultValue = Some(true), // use TUNGSTEN_ENABLED as default doc = "When true, code will be dynamically generated at runtime for expression evaluation in" + - " a specific query.") + " a specific query.", + isPublic = false) val UNSAFE_ENABLED = booleanConf("spark.sql.unsafe.enabled", - defaultValue = Some(true), - doc = "When true, use the new optimized Tungsten physical execution backend.") + defaultValue = Some(true), // use TUNGSTEN_ENABLED as default + doc = "When true, use the new optimized Tungsten physical execution backend.", + isPublic = false) val DIALECT = stringConf( "spark.sql.dialect", @@ -305,7 +312,7 @@ private[spark] object SQLConf { doc = "When true, enable filter pushdown for ORC files.") val HIVE_VERIFY_PARTITION_PATH = booleanConf("spark.sql.hive.verifyPartitionPath", - defaultValue = Some(true), + defaultValue = Some(false), doc = "") val HIVE_METASTORE_PARTITION_PRUNING = booleanConf("spark.sql.hive.metastorePartitionPruning", @@ -359,17 +366,21 @@ private[spark] object SQLConf { "storing additional schema information in Hive's metastore.", isPublic = false) - // Whether to perform partition discovery when loading external data sources. Default to true. 
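Among the SQLConf changes above, `spark.sql.tungsten.enabled` becomes the public switch, and the codegen and unsafe flags (now internal) default to its value. A sketch of turning the Tungsten backend off for a session, e.g. while debugging:

{{{
// Defaults to true; codegen and unsafe execution fall back to this value
// unless they are set explicitly.
sqlContext.setConf("spark.sql.tungsten.enabled", "false")
}}}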
val PARTITION_DISCOVERY_ENABLED = booleanConf("spark.sql.sources.partitionDiscovery.enabled", defaultValue = Some(true), doc = "When true, automtically discover data partitions.") - // Whether to perform partition column type inference. Default to true. val PARTITION_COLUMN_TYPE_INFERENCE = booleanConf("spark.sql.sources.partitionColumnTypeInference.enabled", defaultValue = Some(true), doc = "When true, automatically infer the data types for partitioned columns.") + val PARTITION_MAX_FILES = + intConf("spark.sql.sources.maxConcurrentWrites", + defaultValue = Some(5), + doc = "The maximum number of concurent files to open before falling back on sorting when " + + "writing out files using dynamic partitioning.") + // The output committer class used by HadoopFsRelation. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. // @@ -409,14 +420,6 @@ private[spark] object SQLConf { val USE_SQL_AGGREGATE2 = booleanConf("spark.sql.useAggregate2", defaultValue = Some(true), doc = "") - val USE_SQL_SERIALIZER2 = booleanConf( - "spark.sql.useSerializer2", - defaultValue = Some(true), isPublic = false) - - val ADVANCED_SQL_OPTIMIZATION = booleanConf( - "spark.sql.advancedOptimization", - defaultValue = Some(true), isPublic = false) - object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -431,7 +434,6 @@ private[spark] object SQLConf { * * SQLConf is thread-safe (internally synchronized, so safe to be used in multiple threads). */ - private[sql] class SQLConf extends Serializable with CatalystConf { import SQLConf._ @@ -478,18 +480,14 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN) - private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED) + private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, getConf(TUNGSTEN_ENABLED)) def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE) - private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED) + private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED, getConf(TUNGSTEN_ENABLED)) private[spark] def useSqlAggregate2: Boolean = getConf(USE_SQL_AGGREGATE2) - private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2) - - private[spark] def advancedSqlOptimizations: Boolean = getConf(ADVANCED_SQL_OPTIMIZATION) - private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD) private[spark] def defaultSizeInBytes: Long = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 31e2b508d485..a1eea09e0477 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -23,7 +23,6 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConversions._ import scala.collection.immutable -import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal @@ -41,10 +40,9 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _} import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.optimizer.FilterNullsInJoinKey +import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} import org.apache.spark.sql.sources.BaseRelation import 
org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils /** @@ -75,6 +73,11 @@ class SQLContext(@transient val sparkContext: SparkContext) */ protected[sql] def conf = currentSession().conf + // `listener` should be only used in the driver + @transient private[sql] val listener = new SQLListener(this) + sparkContext.addSparkListener(listener) + sparkContext.ui.foreach(new SQLTab(this, _)) + /** * Set Spark SQL configuration properties. * @@ -157,9 +160,7 @@ class SQLContext(@transient val sparkContext: SparkContext) } @transient - protected[sql] lazy val optimizer: Optimizer = new DefaultOptimizer { - override val extendedOperatorOptimizationRules = FilterNullsInJoinKey(self) :: Nil - } + protected[sql] lazy val optimizer: Optimizer = DefaultOptimizer @transient protected[sql] val ddlParser = new DDLParser(sqlParser.parse(_)) @@ -288,9 +289,6 @@ class SQLContext(@transient val sparkContext: SparkContext) @transient val udf: UDFRegistration = new UDFRegistration(this) - @transient - val udaf: UDAFRegistration = new UDAFRegistration(this) - /** * Returns true if the table is currently cached in-memory. * @group cachemgmt @@ -334,101 +332,25 @@ class SQLContext(@transient val sparkContext: SparkContext) * @since 1.3.0 */ @Experimental - object implicits extends Serializable { - // scalastyle:on + object implicits extends SQLImplicits with Serializable { + protected override def _sqlContext: SQLContext = self /** * Converts $"col name" into an [[Column]]. * @since 1.3.0 */ + // This must live here to preserve binary compatibility with Spark < 1.5. implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { - new ColumnName(sc.s(args : _*)) - } - } - - /** - * An implicit conversion that turns a Scala `Symbol` into a [[Column]]. - * @since 1.3.0 - */ - implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) - - /** - * Creates a DataFrame from an RDD of case classes or tuples. - * @since 1.3.0 - */ - implicit def rddToDataFrameHolder[A <: Product : TypeTag](rdd: RDD[A]): DataFrameHolder = { - DataFrameHolder(self.createDataFrame(rdd)) - } - - /** - * Creates a DataFrame from a local Seq of Product. - * @since 1.3.0 - */ - implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = - { - DataFrameHolder(self.createDataFrame(data)) - } - - // Do NOT add more implicit conversions. They are likely to break source compatibility by - // making existing implicit conversions ambiguous. In particular, RDD[Double] is dangerous - // because of [[DoubleRDDFunctions]]. - - /** - * Creates a single column DataFrame from an RDD[Int]. - * @since 1.3.0 - */ - implicit def intRddToDataFrameHolder(data: RDD[Int]): DataFrameHolder = { - val dataType = IntegerType - val rows = data.mapPartitions { iter => - val row = new SpecificMutableRow(dataType :: Nil) - iter.map { v => - row.setInt(0, v) - row: InternalRow - } + new ColumnName(sc.s(args: _*)) } - DataFrameHolder( - self.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) - } - - /** - * Creates a single column DataFrame from an RDD[Long]. 
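The conversions being removed here are not lost: they move into the new `SQLImplicits` base class added later in this patch, and user code keeps importing them the same way. A sketch, assuming a spark-shell style `sqlContext`:

{{{
import sqlContext.implicits._

val df = Seq(("alice", 1), ("bob", 2)).toDF("name", "id")   // localSeqToDataFrameHolder
df.select($"name", 'id).show()                              // StringToColumn and symbolToColumn
}}}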
- * @since 1.3.0 - */ - implicit def longRddToDataFrameHolder(data: RDD[Long]): DataFrameHolder = { - val dataType = LongType - val rows = data.mapPartitions { iter => - val row = new SpecificMutableRow(dataType :: Nil) - iter.map { v => - row.setLong(0, v) - row: InternalRow - } - } - DataFrameHolder( - self.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) - } - - /** - * Creates a single column DataFrame from an RDD[String]. - * @since 1.3.0 - */ - implicit def stringRddToDataFrameHolder(data: RDD[String]): DataFrameHolder = { - val dataType = StringType - val rows = data.mapPartitions { iter => - val row = new SpecificMutableRow(dataType :: Nil) - iter.map { v => - row.update(0, UTF8String.fromString(v)) - row: InternalRow - } - } - DataFrameHolder( - self.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) } } + // scalastyle:on /** * :: Experimental :: - * Creates a DataFrame from an RDD of case classes. + * Creates a DataFrame from an RDD of Product (e.g. case classes, tuples). * * @group dataframes * @since 1.3.0 @@ -662,9 +584,10 @@ class SQLContext(@transient val sparkContext: SparkContext) tableName: String, source: String, options: Map[String, String]): DataFrame = { + val tableIdent = new SqlParser().parseTableIdentifier(tableName) val cmd = CreateTableUsing( - tableName, + tableIdent, userSpecifiedSchema = None, source, temporary = false, @@ -672,7 +595,7 @@ class SQLContext(@transient val sparkContext: SparkContext) allowExisting = false, managedIfNoPath = false) executePlan(cmd).toRdd - table(tableName) + table(tableIdent) } /** @@ -707,9 +630,10 @@ class SQLContext(@transient val sparkContext: SparkContext) source: String, schema: StructType, options: Map[String, String]): DataFrame = { + val tableIdent = new SqlParser().parseTableIdentifier(tableName) val cmd = CreateTableUsing( - tableName, + tableIdent, userSpecifiedSchema = Some(schema), source, temporary = false, @@ -717,7 +641,7 @@ class SQLContext(@transient val sparkContext: SparkContext) allowExisting = false, managedIfNoPath = false) executePlan(cmd).toRdd - table(tableName) + table(tableIdent) } /** @@ -802,7 +726,10 @@ class SQLContext(@transient val sparkContext: SparkContext) * @since 1.3.0 */ def table(tableName: String): DataFrame = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + table(new SqlParser().parseTableIdentifier(tableName)) + } + + private def table(tableIdent: TableIdentifier): DataFrame = { DataFrame(this, catalog.lookupRelation(tableIdent.toSeq)) } @@ -873,7 +800,7 @@ class SQLContext(@transient val sparkContext: SparkContext) HashAggregation :: Aggregation :: LeftSemiJoin :: - HashJoin :: + EquiJoinSelection :: InMemoryScans :: BasicOperators :: CartesianProduct :: @@ -1011,9 +938,6 @@ class SQLContext(@transient val sparkContext: SparkContext) def output = analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") - // TODO previously will output RDD details by run (${stringOrError(toRdd.toDebugString)}) - // however, the `toRdd` will cause the real execution, which is not what we want. - // We need to think about how to avoid the side effect. 
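`createExternalTable` and `table` above now run the table name through `SqlParser.parseTableIdentifier` instead of treating it as an opaque string. A hedged usage sketch (source path and names are illustrative):

{{{
// Registers an external Parquet-backed table, then reads it back through the catalog.
sqlContext.createExternalTable("events", "parquet", Map("path" -> "/data/events"))
val events = sqlContext.table("events")
}}}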
s"""== Parsed Logical Plan == |${stringOrError(logical)} |== Analyzed Logical Plan == @@ -1024,7 +948,6 @@ class SQLContext(@transient val sparkContext: SparkContext) |== Physical Plan == |${stringOrError(executedPlan)} |Code Generation: ${stringOrError(executedPlan.codegenEnabled)} - |== RDD == """.stripMargin.trim } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala new file mode 100644 index 000000000000..bf03c6108842 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.language.implicitConversions +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.types.StructField +import org.apache.spark.unsafe.types.UTF8String + +/** + * A collection of implicit methods for converting common Scala objects into [[DataFrame]]s. + */ +private[sql] abstract class SQLImplicits { + protected def _sqlContext: SQLContext + + /** + * An implicit conversion that turns a Scala `Symbol` into a [[Column]]. + * @since 1.3.0 + */ + implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) + + /** + * Creates a DataFrame from an RDD of Product (e.g. case classes, tuples). + * @since 1.3.0 + */ + implicit def rddToDataFrameHolder[A <: Product : TypeTag](rdd: RDD[A]): DataFrameHolder = { + DataFrameHolder(_sqlContext.createDataFrame(rdd)) + } + + /** + * Creates a DataFrame from a local Seq of Product. + * @since 1.3.0 + */ + implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = + { + DataFrameHolder(_sqlContext.createDataFrame(data)) + } + + // Do NOT add more implicit conversions. They are likely to break source compatibility by + // making existing implicit conversions ambiguous. In particular, RDD[Double] is dangerous + // because of [[DoubleRDDFunctions]]. + + /** + * Creates a single column DataFrame from an RDD[Int]. + * @since 1.3.0 + */ + implicit def intRddToDataFrameHolder(data: RDD[Int]): DataFrameHolder = { + val dataType = IntegerType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.setInt(0, v) + row: InternalRow + } + } + DataFrameHolder( + _sqlContext.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) + } + + /** + * Creates a single column DataFrame from an RDD[Long]. 
+ * @since 1.3.0 + */ + implicit def longRddToDataFrameHolder(data: RDD[Long]): DataFrameHolder = { + val dataType = LongType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.setLong(0, v) + row: InternalRow + } + } + DataFrameHolder( + _sqlContext.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) + } + + /** + * Creates a single column DataFrame from an RDD[String]. + * @since 1.3.0 + */ + implicit def stringRddToDataFrameHolder(data: RDD[String]): DataFrameHolder = { + val dataType = StringType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.update(0, UTF8String.fromString(v)) + row: InternalRow + } + } + DataFrameHolder( + _sqlContext.internalCreateDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 7cd7421a518c..fc4d0938c533 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -26,6 +26,8 @@ import org.apache.spark.Logging import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction import org.apache.spark.sql.types.DataType /** @@ -52,6 +54,21 @@ class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { functionRegistry.registerFunction(name, udf.builder) } + /** + * Register a user-defined aggregate function (UDAF). + * + * @param name the name of the UDAF. + * @param udaf the UDAF needs to be registered. + * @return the registered UDAF. + */ + def register( + name: String, + udaf: UserDefinedAggregateFunction): UserDefinedAggregateFunction = { + def builder(children: Seq[Expression]) = ScalaUDAF(children, udaf) + functionRegistry.registerFunction(name, builder) + udaf + } + // scalastyle:off /* register 0-22 were generated by this script diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala index af1a8ecca9b5..5cbd52bc0590 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.columnar import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -66,7 +66,7 @@ private[sql] sealed trait ColumnStats extends Serializable { * Column statistics represented as a single row, currently including closed lower bound, closed * upper bound and null count. 
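The new `register(name, udaf)` overload above plugs a `UserDefinedAggregateFunction` into the registry through `ScalaUDAF`, so it can be called from SQL. A hedged sketch of a deliberately tiny UDAF (a plain sum of longs) written against that interface; the table name in the final query is a placeholder:

{{{
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class LongSum extends UserDefinedAggregateFunction {
  def inputSchema: StructType = StructType(StructField("value", LongType) :: Nil)
  def bufferSchema: StructType = StructType(StructField("sum", LongType) :: Nil)
  def dataType: DataType = LongType
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0L }
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) buffer(0) = buffer.getLong(0) + input.getLong(0)
  }
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
  }
  def evaluate(buffer: Row): Any = buffer.getLong(0)
}

sqlContext.udf.register("longSum", new LongSum)
sqlContext.sql("SELECT longSum(id) FROM events").show()
}}}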
*/ - def collectedStatistics: InternalRow + def collectedStatistics: GenericInternalRow } /** @@ -75,7 +75,8 @@ private[sql] sealed trait ColumnStats extends Serializable { private[sql] class NoopColumnStats extends ColumnStats { override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal) - override def collectedStatistics: InternalRow = InternalRow(null, null, nullCount, count, 0L) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L)) } private[sql] class BooleanColumnStats extends ColumnStats { @@ -92,8 +93,8 @@ private[sql] class BooleanColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class ByteColumnStats extends ColumnStats { @@ -110,8 +111,8 @@ private[sql] class ByteColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class ShortColumnStats extends ColumnStats { @@ -128,8 +129,8 @@ private[sql] class ShortColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class IntColumnStats extends ColumnStats { @@ -146,8 +147,8 @@ private[sql] class IntColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class LongColumnStats extends ColumnStats { @@ -164,8 +165,8 @@ private[sql] class LongColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class FloatColumnStats extends ColumnStats { @@ -182,8 +183,8 @@ private[sql] class FloatColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class DoubleColumnStats extends ColumnStats { @@ -200,8 +201,8 @@ private[sql] class DoubleColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class StringColumnStats extends ColumnStats { @@ -218,8 +219,8 @@ private[sql] class StringColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def 
collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class BinaryColumnStats extends ColumnStats { @@ -230,8 +231,8 @@ private[sql] class BinaryColumnStats extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(null, null, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes)) } private[sql] class FixedDecimalColumnStats(precision: Int, scale: Int) extends ColumnStats { @@ -248,8 +249,8 @@ private[sql] class FixedDecimalColumnStats(precision: Int, scale: Int) extends C } } - override def collectedStatistics: InternalRow = - InternalRow(lower, upper, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes)) } private[sql] class GenericColumnStats(dataType: DataType) extends ColumnStats { @@ -262,8 +263,8 @@ private[sql] class GenericColumnStats(dataType: DataType) extends ColumnStats { } } - override def collectedStatistics: InternalRow = - InternalRow(null, null, nullCount, count, sizeInBytes) + override def collectedStatistics: GenericInternalRow = + new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes)) } private[sql] class DateColumnStats extends IntColumnStats diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala index 531a8244d55d..ab482a363612 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala @@ -339,6 +339,8 @@ private[sql] object STRING extends NativeColumnType(StringType, 7, 8) { override def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int) { setField(to, toOrdinal, getField(from, fromOrdinal)) } + + override def clone(v: UTF8String): UTF8String = v.clone() } private[sql] object DATE extends NativeColumnType(DateType, 8, 4) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 5d5b0697d701..66d429bc0619 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -120,9 +120,7 @@ private[sql] case class InMemoryRelation( new Iterator[CachedBatch] { def next(): CachedBatch = { val columnBuilders = output.map { attribute => - val columnType = ColumnType(attribute.dataType) - val initialBufferSize = columnType.defaultSize * batchSize - ColumnBuilder(attribute.dataType, initialBufferSize, attribute.name, useCompression) + ColumnBuilder(attribute.dataType, batchSize, attribute.name, useCompression) }.toArray var rowCount = 0 @@ -148,7 +146,7 @@ private[sql] case class InMemoryRelation( } val stats = InternalRow.fromSeq(columnBuilders.map(_.columnStats.collectedStatistics) - .flatMap(_.toSeq)) + .flatMap(_.values)) batchStats += stats CachedBatch(columnBuilders.map(_.build().array()), stats) @@ -330,10 +328,11 @@ private[sql] case class InMemoryColumnarTableScan( if (inMemoryPartitionPruningEnabled) { cachedBatchIterator.filter { cachedBatch => if (!partitionFilter(cachedBatch.stats)) { - def statsString: String = 
relation.partitionStatistics.schema - .zip(cachedBatch.stats.toSeq) - .map { case (a, s) => s"${a.name}: $s" } - .mkString(", ") + def statsString: String = relation.partitionStatistics.schema.zipWithIndex.map { + case (a, i) => + val value = cachedBatch.stats.get(i, a.dataType) + s"${a.name}: $value" + }.mkString(", ") logInfo(s"Skipping partition based on stats $statsString") false } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala index c91d960a0932..ca910a99db08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala @@ -270,20 +270,13 @@ private[sql] case object DictionaryEncoding extends CompressionScheme { class Decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]) extends compression.Decoder[T] { - private val dictionary = { - // TODO Can we clean up this mess? Maybe move this to `DataType`? - implicit val classTag = { - val mirror = runtimeMirror(Utils.getSparkClassLoader) - ClassTag[T#InternalType](mirror.runtimeClass(columnType.scalaTag.tpe)) - } - - Array.fill(buffer.getInt()) { - columnType.extract(buffer) - } + private val dictionary: Array[Any] = { + val elementNum = buffer.getInt() + Array.fill[Any](elementNum)(columnType.extract(buffer).asInstanceOf[Any]) } override def next(row: MutableRow, ordinal: Int): Unit = { - columnType.setField(row, ordinal, dictionary(buffer.getShort())) + columnType.setField(row, ordinal, dictionary(buffer.getShort()).asInstanceOf[T#InternalType]) } override def hasNext: Boolean = buffer.hasRemaining diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala index e8c6a0f8f801..f3b6a3a5f4a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -45,6 +46,10 @@ case class Aggregate( child: SparkPlan) extends UnaryNode { + override private[sql] lazy val metrics = Map( + "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def requiredChildDistribution: List[Distribution] = { if (partial) { UnspecifiedDistribution :: Nil @@ -121,12 +126,15 @@ case class Aggregate( } protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { + val numInputRows = longMetric("numInputRows") + val numOutputRows = longMetric("numOutputRows") if (groupingExpressions.isEmpty) { child.execute().mapPartitions { iter => val buffer = newAggregateBuffer() var currentRow: InternalRow = null while (iter.hasNext) { currentRow = iter.next() + numInputRows += 1 var i = 0 while (i < buffer.length) { buffer(i).update(currentRow) @@ -142,6 +150,7 @@ case class Aggregate( i += 1 } + numOutputRows += 1 Iterator(resultProjection(aggregateResults)) } } else { @@ -152,6 +161,7 @@ case class Aggregate( var currentRow: 
InternalRow = null while (iter.hasNext) { currentRow = iter.next() + numInputRows += 1 val currentGroup = groupingProjection(currentRow) var currentBuffer = hashTable.get(currentGroup) if (currentBuffer == null) { @@ -180,6 +190,7 @@ case class Aggregate( val currentEntry = hashTableIter.next() val currentGroup = currentEntry.getKey val currentBuffer = currentEntry.getValue + numOutputRows += 1 var i = 0 while (i < currentBuffer.length) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 05b009d1935b..029f2264a6a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.util.MutablePair @@ -39,20 +40,26 @@ import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkEn @DeveloperApi case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends UnaryNode { - override def outputPartitioning: Partitioning = newPartitioning + override def nodeName: String = if (tungstenMode) "TungstenExchange" else "Exchange" - override def output: Seq[Attribute] = child.output + /** + * Returns true iff we can support the data type, and we are not doing range partitioning. + */ + private lazy val tungstenMode: Boolean = { + unsafeEnabled && codegenEnabled && GenerateUnsafeProjection.canSupport(child.schema) && + !newPartitioning.isInstanceOf[RangePartitioning] + } - override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows + override def outputPartitioning: Partitioning = newPartitioning - override def canProcessSafeRows: Boolean = true + override def output: Seq[Attribute] = child.output - override def canProcessUnsafeRows: Boolean = { - // Do not use the Unsafe path if we are using a RangePartitioning, since this may lead to - // an interpreted RowOrdering being applied to an UnsafeRow, which will lead to - // ClassCastExceptions at runtime. This check can be removed after SPARK-9054 is fixed. - !newPartitioning.isInstanceOf[RangePartitioning] - } + // This setting is somewhat counterintuitive: + // If the schema works with UnsafeRow, then we tell the planner that we don't support safe row, + // so the planner inserts a converter to convert data into UnsafeRow if needed. + override def outputsUnsafeRows: Boolean = tungstenMode + override def canProcessSafeRows: Boolean = !tungstenMode + override def canProcessUnsafeRows: Boolean = tungstenMode /** * Determines whether records must be defensively copied before being sent to the shuffle. @@ -124,23 +131,9 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una private val serializer: Serializer = { val rowDataTypes = child.output.map(_.dataType).toArray - // It is true when there is no field that needs to be write out. - // For now, we will not use SparkSqlSerializer2 when noField is true. - val noField = rowDataTypes == null || rowDataTypes.length == 0 - - val useSqlSerializer2 = - child.sqlContext.conf.useSqlSerializer2 && // SparkSqlSerializer2 is enabled. 
- SparkSqlSerializer2.support(rowDataTypes) && // The schema of row is supported. - !noField - - if (child.outputsUnsafeRows) { - logInfo("Using UnsafeRowSerializer.") + if (tungstenMode) { new UnsafeRowSerializer(child.output.size) - } else if (useSqlSerializer2) { - logInfo("Using SparkSqlSerializer2.") - new SparkSqlSerializer2(rowDataTypes) } else { - logInfo("Using SparkSqlSerializer.") new SparkSqlSerializer(sparkConf) } } @@ -156,7 +149,10 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una val mutablePair = new MutablePair[InternalRow, Null]() iter.map(row => mutablePair.update(row.copy(), null)) } - implicit val ordering = new RowOrdering(sortingExpressions, child.output) + // We need to use an interpreted ordering here because generated orderings cannot be + // serialized and this ordering needs to be created on the driver in order to be passed into + // Spark core code. + implicit val ordering = new InterpretedOrdering(sortingExpressions, child.output) new RangePartitioner(numPartitions, rddForSampling, ascending = true) case SinglePartition => new Partitioner { @@ -194,66 +190,72 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una * of input data meets the * [[org.apache.spark.sql.catalyst.plans.physical.Distribution Distribution]] requirements for * each operator by inserting [[Exchange]] Operators where required. Also ensure that the - * required input partition ordering requirements are met. + * input partition ordering requirements are met. */ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[SparkPlan] { // TODO: Determine the number of partitions. - def numPartitions: Int = sqlContext.conf.numShufflePartitions + private def numPartitions: Int = sqlContext.conf.numShufflePartitions - def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator: SparkPlan => - // Adds Exchange or Sort operators as required - def addOperatorsIfNecessary( - partitioning: Partitioning, - rowOrdering: Seq[SortOrder], - child: SparkPlan): SparkPlan = { + /** + * Given a required distribution, returns a partitioning that satisfies that distribution. 
+ */ + private def canonicalPartitioning(requiredDistribution: Distribution): Partitioning = { + requiredDistribution match { + case AllTuples => SinglePartition + case ClusteredDistribution(clustering) => HashPartitioning(clustering, numPartitions) + case OrderedDistribution(ordering) => RangePartitioning(ordering, numPartitions) + case dist => sys.error(s"Do not know how to satisfy distribution $dist") + } + } - def addShuffleIfNecessary(child: SparkPlan): SparkPlan = { - if (!child.outputPartitioning.guarantees(partitioning)) { - Exchange(partitioning, child) - } else { - child - } - } + private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = { + val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution + val requiredChildOrderings: Seq[Seq[SortOrder]] = operator.requiredChildOrdering + var children: Seq[SparkPlan] = operator.children - def addSortIfNecessary(child: SparkPlan): SparkPlan = { + // Ensure that the operator's children satisfy their output distribution requirements: + children = children.zip(requiredChildDistributions).map { case (child, distribution) => + if (child.outputPartitioning.satisfies(distribution)) { + child + } else { + Exchange(canonicalPartitioning(distribution), child) + } + } - if (rowOrdering.nonEmpty) { - // If child.outputOrdering is [a, b] and rowOrdering is [a], we do not need to sort. - val minSize = Seq(rowOrdering.size, child.outputOrdering.size).min - if (minSize == 0 || rowOrdering.take(minSize) != child.outputOrdering.take(minSize)) { - sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child) - } else { - child - } - } else { - child - } + // If the operator has multiple children and specifies child output distributions (e.g. join), + // then the children's output partitionings must be compatible: + if (children.length > 1 + && requiredChildDistributions.toSet != Set(UnspecifiedDistribution) + && !Partitioning.allCompatible(children.map(_.outputPartitioning))) { + children = children.zip(requiredChildDistributions).map { case (child, distribution) => + val targetPartitioning = canonicalPartitioning(distribution) + if (child.outputPartitioning.guarantees(targetPartitioning)) { + child + } else { + Exchange(targetPartitioning, child) } - - addSortIfNecessary(addShuffleIfNecessary(child)) } + } - val requirements = - (operator.requiredChildDistribution, operator.requiredChildOrdering, operator.children) - - val fixedChildren = requirements.zipped.map { - case (AllTuples, rowOrdering, child) => - addOperatorsIfNecessary(SinglePartition, rowOrdering, child) - case (ClusteredDistribution(clustering), rowOrdering, child) => - addOperatorsIfNecessary(HashPartitioning(clustering, numPartitions), rowOrdering, child) - case (OrderedDistribution(ordering), rowOrdering, child) => - addOperatorsIfNecessary(RangePartitioning(ordering, numPartitions), rowOrdering, child) - - case (UnspecifiedDistribution, Seq(), child) => + // Now that we've performed any necessary shuffles, add sorts to guarantee output orderings: + children = children.zip(requiredChildOrderings).map { case (child, requiredOrdering) => + if (requiredOrdering.nonEmpty) { + // If child.outputOrdering is [a, b] and requiredOrdering is [a], we do not need to sort. 
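The rewritten rule above first adds an Exchange wherever a child's output partitioning does not satisfy the required distribution, then adds sorts for any required child ordering. Its effect is easiest to see from a physical plan; a hedged sketch (`df1` and `df2` are placeholder DataFrames):

{{{
// An equi-join requires both sides to be clustered on the join key, so the printed
// physical plan should show Exchange (and possibly sort) operators inserted by this rule.
val joined = df1.join(df2, df1("id") === df2("id"))
joined.explain()
}}}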
+ val minSize = Seq(requiredOrdering.size, child.outputOrdering.size).min + if (minSize == 0 || requiredOrdering.take(minSize) != child.outputOrdering.take(minSize)) { + sqlContext.planner.BasicOperators.getSortOperator(requiredOrdering, global = false, child) + } else { child - case (UnspecifiedDistribution, rowOrdering, child) => - sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child) - - case (dist, ordering, _) => - sys.error(s"Don't know how to ensure $dist with ordering $ordering") + } + } else { + child } + } - operator.withNewChildren(fixedChildren) + operator.withNewChildren(children) + } + + def apply(plan: SparkPlan): SparkPlan = plan.transformUp { + case operator: SparkPlan => ensureDistributionAndOrdering(operator) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index da27a753a710..abb60cf12e3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} +import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} @@ -95,8 +96,21 @@ private[sql] case class LogicalRDD( /** Physical plan node for scanning data from an RDD. */ private[sql] case class PhysicalRDD( output: Seq[Attribute], - rdd: RDD[InternalRow]) extends LeafNode { + rdd: RDD[InternalRow], + extraInformation: String) extends LeafNode { + protected override def doExecute(): RDD[InternalRow] = rdd + + override def simpleString: String = "Scan " + extraInformation + output.mkString("[", ",", "]") +} + +private[sql] object PhysicalRDD { + def createFromDataSource( + output: Seq[Attribute], + rdd: RDD[InternalRow], + relation: BaseRelation): PhysicalRDD = { + PhysicalRDD(output, rdd, relation.toString) + } } /** Logical plan node for scanning data from a local collection. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala new file mode 100644 index 000000000000..7a2a9eed5807 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +/** + * An interface for relations that are backed by files. When a class implements this interface, + * the list of paths that it returns will be returned to a user who calls `inputPaths` on any + * DataFrame that queries this relation. + */ +private[sql] trait FileRelation { + /** Returns the list of files that will be read when scanning this relation. */ + def inputFiles: Array[String] +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala deleted file mode 100644 index cd87b8deba0c..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import java.io.IOException - -import org.apache.spark.{SparkEnv, TaskContext} -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.catalyst.trees._ -import org.apache.spark.sql.types._ - -case class AggregateEvaluation( - schema: Seq[Attribute], - initialValues: Seq[Expression], - update: Seq[Expression], - result: Expression) - -/** - * :: DeveloperApi :: - * Alternate version of aggregation that leverages projection and thus code generation. - * Aggregations are converted into a set of projections from a aggregation buffer tuple back onto - * itself. Currently only used for simple aggregations like SUM, COUNT, or AVERAGE are supported. - * - * @param partial if true then aggregation is done partially on local data without shuffling to - * ensure all values where `groupingExpressions` are equal are present. - * @param groupingExpressions expressions that are evaluated to determine grouping. - * @param aggregateExpressions expressions that are computed for each group. - * @param unsafeEnabled whether to allow Unsafe-based aggregation buffers to be used. - * @param child the input data source. 
- */ -@DeveloperApi -case class GeneratedAggregate( - partial: Boolean, - groupingExpressions: Seq[Expression], - aggregateExpressions: Seq[NamedExpression], - unsafeEnabled: Boolean, - child: SparkPlan) - extends UnaryNode { - - override def requiredChildDistribution: Seq[Distribution] = - if (partial) { - UnspecifiedDistribution :: Nil - } else { - if (groupingExpressions == Nil) { - AllTuples :: Nil - } else { - ClusteredDistribution(groupingExpressions) :: Nil - } - } - - override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) - - protected override def doExecute(): RDD[InternalRow] = { - val aggregatesToCompute = aggregateExpressions.flatMap { a => - a.collect { case agg: AggregateExpression1 => agg} - } - - // If you add any new function support, please add tests in org.apache.spark.sql.SQLQuerySuite - // (in test "aggregation with codegen"). - val computeFunctions = aggregatesToCompute.map { - case c @ Count(expr) => - // If we're evaluating UnscaledValue(x), we can do Count on x directly, since its - // UnscaledValue will be null if and only if x is null; helps with Average on decimals - val toCount = expr match { - case UnscaledValue(e) => e - case _ => expr - } - val currentCount = AttributeReference("currentCount", LongType, nullable = false)() - val initialValue = Literal(0L) - val updateFunction = If(IsNotNull(toCount), Add(currentCount, Literal(1L)), currentCount) - val result = currentCount - - AggregateEvaluation(currentCount :: Nil, initialValue :: Nil, updateFunction :: Nil, result) - - case s @ Sum(expr) => - val calcType = - expr.dataType match { - case DecimalType.Fixed(p, s) => - DecimalType.bounded(p + 10, s) - case _ => - expr.dataType - } - - val currentSum = AttributeReference("currentSum", calcType, nullable = true)() - val initialValue = Literal.create(null, calcType) - - // Coalesce avoids double calculation... - // but really, common sub expression elimination would be better.... 
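
As context for the GeneratedAggregate code being deleted here: each supported aggregate was expressed as an AggregateEvaluation, i.e. a buffer schema plus an initial value, an update expression, and a result expression. A minimal plain-Scala sketch of that initialize/update/result shape for COUNT and SUM (illustrative only; SimpleAgg and the demo values are not Spark APIs):

    // Illustrative only: the initialize / update / result pattern that
    // AggregateEvaluation encoded, written against plain Scala values.
    case class SimpleAgg[B](initial: B, update: (B, Option[Long]) => B, result: B => Long)

    object SimpleAggDemo {
      // COUNT ignores nulls (None); SUM treats a null input as a no-op.
      val count = SimpleAgg[Long](0L, (c, v) => if (v.isDefined) c + 1L else c, identity)
      val sum   = SimpleAgg[Long](0L, (s, v) => s + v.getOrElse(0L), identity)

      def run(values: Seq[Option[Long]], agg: SimpleAgg[Long]): Long =
        agg.result(values.foldLeft(agg.initial)(agg.update))

      def main(args: Array[String]): Unit = {
        val data = Seq(Some(1L), None, Some(3L))
        println(run(data, count)) // 2: the null is not counted
        println(run(data, sum))   // 4
      }
    }
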
- val zero = Cast(Literal(0), calcType) - val updateFunction = Coalesce( - Add( - Coalesce(currentSum :: zero :: Nil), - Cast(expr, calcType) - ) :: currentSum :: Nil) - val result = - expr.dataType match { - case DecimalType.Fixed(_, _) => - Cast(currentSum, s.dataType) - case _ => currentSum - } - - AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result) - - case m @ Max(expr) => - val currentMax = AttributeReference("currentMax", expr.dataType, nullable = true)() - val initialValue = Literal.create(null, expr.dataType) - val updateMax = MaxOf(currentMax, expr) - - AggregateEvaluation( - currentMax :: Nil, - initialValue :: Nil, - updateMax :: Nil, - currentMax) - - case m @ Min(expr) => - val currentMin = AttributeReference("currentMin", expr.dataType, nullable = true)() - val initialValue = Literal.create(null, expr.dataType) - val updateMin = MinOf(currentMin, expr) - - AggregateEvaluation( - currentMin :: Nil, - initialValue :: Nil, - updateMin :: Nil, - currentMin) - - case CollectHashSet(Seq(expr)) => - val set = - AttributeReference("hashSet", new OpenHashSetUDT(expr.dataType), nullable = false)() - val initialValue = NewSet(expr.dataType) - val addToSet = AddItemToSet(expr, set) - - AggregateEvaluation( - set :: Nil, - initialValue :: Nil, - addToSet :: Nil, - set) - - case CombineSetsAndCount(inputSet) => - val elementType = inputSet.dataType.asInstanceOf[OpenHashSetUDT].elementType - val set = - AttributeReference("hashSet", new OpenHashSetUDT(elementType), nullable = false)() - val initialValue = NewSet(elementType) - val collectSets = CombineSets(set, inputSet) - - AggregateEvaluation( - set :: Nil, - initialValue :: Nil, - collectSets :: Nil, - CountSet(set)) - - case o => sys.error(s"$o can't be codegened.") - } - - val computationSchema = computeFunctions.flatMap(_.schema) - - val resultMap: Map[TreeNodeRef, Expression] = - aggregatesToCompute.zip(computeFunctions).map { - case (agg, func) => new TreeNodeRef(agg) -> func.result - }.toMap - - val namedGroups = groupingExpressions.zipWithIndex.map { - case (ne: NamedExpression, _) => (ne, ne.toAttribute) - case (e, i) => (e, Alias(e, s"GroupingExpr$i")().toAttribute) - } - - // The set of expressions that produce the final output given the aggregation buffer and the - // grouping expressions. - val resultExpressions = aggregateExpressions.map(_.transform { - case e: Expression if resultMap.contains(new TreeNodeRef(e)) => resultMap(new TreeNodeRef(e)) - case e: Expression => - namedGroups.collectFirst { - case (expr, attr) if expr semanticEquals e => attr - }.getOrElse(e) - }) - - val aggregationBufferSchema: StructType = StructType.fromAttributes(computationSchema) - - val groupKeySchema: StructType = { - val fields = groupingExpressions.zipWithIndex.map { case (expr, idx) => - // This is a dummy field name - StructField(idx.toString, expr.dataType, expr.nullable) - } - StructType(fields) - } - - val schemaSupportsUnsafe: Boolean = { - UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) && - UnsafeProjection.canSupport(groupKeySchema) - } - - child.execute().mapPartitions { iter => - // Builds a new custom class for holding the results of aggregation for a group. - val initialValues = computeFunctions.flatMap(_.initialValues) - val newAggregationBuffer = newProjection(initialValues, child.output) - log.info(s"Initial values: ${initialValues.mkString(",")}") - - // A projection that computes the group given an input tuple. 
- val groupProjection = newProjection(groupingExpressions, child.output) - log.info(s"Grouping Projection: ${groupingExpressions.mkString(",")}") - - // A projection that is used to update the aggregate values for a group given a new tuple. - // This projection should be targeted at the current values for the group and then applied - // to a joined row of the current values with the new input row. - val updateExpressions = computeFunctions.flatMap(_.update) - val updateSchema = computeFunctions.flatMap(_.schema) ++ child.output - val updateProjection = newMutableProjection(updateExpressions, updateSchema)() - log.info(s"Update Expressions: ${updateExpressions.mkString(",")}") - - // A projection that produces the final result, given a computation. - val resultProjectionBuilder = - newMutableProjection( - resultExpressions, - namedGroups.map(_._2) ++ computationSchema) - log.info(s"Result Projection: ${resultExpressions.mkString(",")}") - - val joinedRow = new JoinedRow - - if (!iter.hasNext) { - // This is an empty input, so return early so that we do not allocate data structures - // that won't be cleaned up (see SPARK-8357). - if (groupingExpressions.isEmpty) { - // This is a global aggregate, so return an empty aggregation buffer. - val resultProjection = resultProjectionBuilder() - Iterator(resultProjection(newAggregationBuffer(EmptyRow))) - } else { - // This is a grouped aggregate, so return an empty iterator. - Iterator[InternalRow]() - } - } else if (groupingExpressions.isEmpty) { - // TODO: Codegening anything other than the updateProjection is probably over kill. - val buffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow] - var currentRow: InternalRow = null - updateProjection.target(buffer) - - while (iter.hasNext) { - currentRow = iter.next() - updateProjection(joinedRow(buffer, currentRow)) - } - - val resultProjection = resultProjectionBuilder() - Iterator(resultProjection(buffer)) - - } else if (unsafeEnabled && schemaSupportsUnsafe) { - assert(iter.hasNext, "There should be at least one row for this path") - log.info("Using Unsafe-based aggregator") - val pageSizeBytes = SparkEnv.get.conf.getSizeAsBytes("spark.buffer.pageSize", "64m") - val aggregationMap = new UnsafeFixedWidthAggregationMap( - newAggregationBuffer(EmptyRow), - aggregationBufferSchema, - groupKeySchema, - TaskContext.get.taskMemoryManager(), - SparkEnv.get.shuffleMemoryManager, - 1024 * 16, // initial capacity - pageSizeBytes, - false // disable tracking of performance metrics - ) - - while (iter.hasNext) { - val currentRow: InternalRow = iter.next() - val groupKey: InternalRow = groupProjection(currentRow) - val aggregationBuffer = aggregationMap.getAggregationBuffer(groupKey) - if (aggregationBuffer == null) { - throw new IOException("Could not allocate memory to grow aggregation buffer") - } - updateProjection.target(aggregationBuffer)(joinedRow(aggregationBuffer, currentRow)) - } - - new Iterator[InternalRow] { - private[this] val mapIterator = aggregationMap.iterator() - private[this] val resultProjection = resultProjectionBuilder() - private[this] var _hasNext = mapIterator.next() - - def hasNext: Boolean = _hasNext - - def next(): InternalRow = { - if (_hasNext) { - val result = resultProjection(joinedRow(mapIterator.getKey, mapIterator.getValue)) - _hasNext = mapIterator.next() - if (_hasNext) { - result - } else { - // This is the last element in the iterator, so let's free the buffer. 
Before we do, - // though, we need to make a defensive copy of the result so that we don't return an - // object that might contain dangling pointers to the freed memory - val resultCopy = result.copy() - aggregationMap.free() - resultCopy - } - } else { - throw new java.util.NoSuchElementException - } - } - } - } else { - if (unsafeEnabled) { - log.info("Not using Unsafe-based aggregator because it is not supported for this schema") - } - val buffers = new java.util.HashMap[InternalRow, MutableRow]() - - var currentRow: InternalRow = null - while (iter.hasNext) { - currentRow = iter.next() - val currentGroup = groupProjection(currentRow) - var currentBuffer = buffers.get(currentGroup) - if (currentBuffer == null) { - currentBuffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow] - buffers.put(currentGroup, currentBuffer) - } - // Target the projection at the current aggregation buffer and then project the updated - // values. - updateProjection.target(currentBuffer)(joinedRow(currentBuffer, currentRow)) - } - - new Iterator[InternalRow] { - private[this] val resultIterator = buffers.entrySet.iterator() - private[this] val resultProjection = resultProjectionBuilder() - - def hasNext: Boolean = resultIterator.hasNext - - def next(): InternalRow = { - val currentGroup = resultIterator.next() - resultProjection(joinedRow(currentGroup.getKey, currentGroup.getValue)) - } - } - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/RowIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/RowIterator.scala new file mode 100644 index 000000000000..7462dbc4eba3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/RowIterator.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.NoSuchElementException + +import org.apache.spark.sql.catalyst.InternalRow + +/** + * An internal iterator interface which presents a more restrictive API than + * [[scala.collection.Iterator]]. + * + * One major departure from the Scala iterator API is the fusing of the `hasNext()` and `next()` + * calls: Scala's iterator allows users to call `hasNext()` without immediately advancing the + * iterator to consume the next row, whereas RowIterator combines these calls into a single + * [[advanceNext()]] method. + */ +private[sql] abstract class RowIterator { + /** + * Advance this iterator by a single row. Returns `false` if this iterator has no more rows + * and `true` otherwise. If this returns `true`, then the new row can be retrieved by calling + * [[getRow]]. + */ + def advanceNext(): Boolean + + /** + * Retrieve the row from this iterator. This method is idempotent. 
It is illegal to call this + * method after [[advanceNext()]] has returned `false`. + */ + def getRow: InternalRow + + /** + * Convert this RowIterator into a [[scala.collection.Iterator]]. + */ + def toScala: Iterator[InternalRow] = new RowIteratorToScala(this) +} + +object RowIterator { + def fromScala(scalaIter: Iterator[InternalRow]): RowIterator = { + scalaIter match { + case wrappedRowIter: RowIteratorToScala => wrappedRowIter.rowIter + case _ => new RowIteratorFromScala(scalaIter) + } + } +} + +private final class RowIteratorToScala(val rowIter: RowIterator) extends Iterator[InternalRow] { + private [this] var hasNextWasCalled: Boolean = false + private [this] var _hasNext: Boolean = false + override def hasNext: Boolean = { + // Idempotency: + if (!hasNextWasCalled) { + _hasNext = rowIter.advanceNext() + hasNextWasCalled = true + } + _hasNext + } + override def next(): InternalRow = { + if (!hasNext) throw new NoSuchElementException + hasNextWasCalled = false + rowIter.getRow + } +} + +private final class RowIteratorFromScala(scalaIter: Iterator[InternalRow]) extends RowIterator { + private[this] var _next: InternalRow = null + override def advanceNext(): Boolean = { + if (scalaIter.hasNext) { + _next = scalaIter.next() + true + } else { + _next = null + false + } + } + override def getRow: InternalRow = _next + override def toScala: Iterator[InternalRow] = scalaIter +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala new file mode 100644 index 000000000000..cee58218a885 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.concurrent.atomic.AtomicLong + +import org.apache.spark.SparkContext +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.execution.ui.SparkPlanGraph +import org.apache.spark.util.Utils + +private[sql] object SQLExecution { + + val EXECUTION_ID_KEY = "spark.sql.execution.id" + + private val _nextExecutionId = new AtomicLong(0) + + private def nextExecutionId: Long = _nextExecutionId.getAndIncrement + + /** + * Wrap an action that will execute "queryExecution" to track all Spark jobs in the body so that + * we can connect them with an execution. 
+ */ + def withNewExecutionId[T]( + sqlContext: SQLContext, queryExecution: SQLContext#QueryExecution)(body: => T): T = { + val sc = sqlContext.sparkContext + val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) + if (oldExecutionId == null) { + val executionId = SQLExecution.nextExecutionId + sc.setLocalProperty(EXECUTION_ID_KEY, executionId.toString) + val r = try { + val callSite = Utils.getCallSite() + sqlContext.listener.onExecutionStart( + executionId, + callSite.shortForm, + callSite.longForm, + queryExecution.toString, + SparkPlanGraph(queryExecution.executedPlan), + System.currentTimeMillis()) + try { + body + } finally { + // Ideally, we need to make sure onExecutionEnd happens after onJobStart and onJobEnd. + // However, onJobStart and onJobEnd run in the listener thread. Because we cannot add new + // SQL event types to SparkListener since it's a public API, we cannot guarantee that. + // + // SQLListener should handle the case that onExecutionEnd happens before onJobEnd. + // + // The worst case is onExecutionEnd may happen before onJobStart when the listener thread + // is very busy. If so, we cannot track the jobs for the execution. It seems acceptable. + sqlContext.listener.onExecutionEnd(executionId, System.currentTimeMillis()) + } + } finally { + sc.setLocalProperty(EXECUTION_ID_KEY, null) + } + r + } else { + // Don't support nested `withNewExecutionId`. This is an example of the nested + // `withNewExecutionId`: + // + // class DataFrame { + // def foo: T = withNewExecutionId { something.createNewDataFrame().collect() } + // } + // + // Note: `collect` will call withNewExecutionId + // In this case, only the "executedPlan" for "collect" will be executed. The "executedPlan" + // for the outer DataFrame won't be executed. So it's meaningless to create a new Execution + // for the outer DataFrame. Even if we track it, since its "executedPlan" doesn't run, + // all accumulator metrics will be 0. It will confuse people if we show them in Web UI. + // + // A real case is the `DataFrame.count` method. + throw new IllegalArgumentException(s"$EXECUTION_ID_KEY is already set") + } + } + + /** + * Wrap an action with a known executionId. When running a different action in a different + * thread from the original one, this method can be used to connect the Spark jobs in this action + * with the known executionId, e.g., `BroadcastHashJoin.broadcastFuture`. 
+ */ + def withExecutionId[T](sc: SparkContext, executionId: String)(body: => T): T = { + val oldExecutionId = sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + try { + sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, executionId) + body + } finally { + sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, oldExecutionId) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala index 49adf215379c..e17b50edc62d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala @@ -38,6 +38,8 @@ object SortPrefixUtils { sortOrder.dataType match { case StringType => if (sortOrder.isAscending) PrefixComparators.STRING else PrefixComparators.STRING_DESC + case BinaryType => + if (sortOrder.isAscending) PrefixComparators.BINARY else PrefixComparators.BINARY_DESC case BooleanType | ByteType | ShortType | IntegerType | LongType | DateType | TimestampType => if (sortOrder.isAscending) PrefixComparators.LONG else PrefixComparators.LONG_DESC case dt: DecimalType if dt.precision - dt.scale <= Decimal.MAX_LONG_DIGITS => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 50c27def8ea5..72f5450510a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution +import java.util.concurrent.atomic.AtomicBoolean + import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging @@ -30,6 +32,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetric, SQLMetrics} +import org.apache.spark.sql.types.DataType object SparkPlan { protected[sql] val currentContext = new ThreadLocal[SQLContext]() @@ -52,19 +56,41 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ protected def sparkContext = sqlContext.sparkContext // sqlContext will be null when we are being deserialized on the slaves. In this instance - // the value of codegenEnabled will be set by the desserializer after the constructor has run. + // the value of codegenEnabled/unsafeEnabled will be set by the desserializer after the + // constructor has run. val codegenEnabled: Boolean = if (sqlContext != null) { sqlContext.conf.codegenEnabled } else { false } + val unsafeEnabled: Boolean = if (sqlContext != null) { + sqlContext.conf.unsafeEnabled + } else { + false + } + + /** + * Whether the "prepare" method is called. + */ + private val prepareCalled = new AtomicBoolean(false) /** Overridden make copy also propogates sqlContext to copied plan. */ - override def makeCopy(newArgs: Array[AnyRef]): this.type = { + override def makeCopy(newArgs: Array[AnyRef]): SparkPlan = { SparkPlan.currentContext.set(sqlContext) super.makeCopy(newArgs) } + /** + * Return all metrics containing metrics of this SparkPlan. + */ + private[sql] def metrics: Map[String, SQLMetric[_, _]] = Map.empty + + /** + * Return a LongSQLMetric according to the name. 
+ */ + private[sql] def longMetric(name: String): LongSQLMetric = + metrics(name).asInstanceOf[LongSQLMetric] + // TODO: Move to `DistributedPlan` /** Specifies how data is partitioned across different nodes in the cluster. */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! @@ -110,10 +136,31 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ "Operator will receive unsafe rows as input but cannot process unsafe rows") } RDDOperationScope.withScope(sparkContext, nodeName, false, true) { + prepare() doExecute() } } + /** + * Prepare a SparkPlan for execution. It's idempotent. + */ + final def prepare(): Unit = { + if (prepareCalled.compareAndSet(false, true)) { + doPrepare() + children.foreach(_.prepare()) + } + } + + /** + * Overridden by concrete implementations of SparkPlan. It is guaranteed to run before any + * `execute` of SparkPlan. This is helpful if we want to set up some state before executing the + * query, e.g., `BroadcastHashJoin` uses it to broadcast asynchronously. + * + * Note: the prepare method has already walked down the tree, so the implementation doesn't need + * to call children's prepare methods. + */ + protected def doPrepare(): Unit = {} + /** * Overridden by concrete implementations of SparkPlan. * Produces the result of the query as an RDD[InternalRow] @@ -251,12 +298,21 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ throw e } else { log.error("Failed to generate ordering, fallback to interpreted", e) - new RowOrdering(order, inputSchema) + new InterpretedOrdering(order, inputSchema) } } } else { - new RowOrdering(order, inputSchema) + new InterpretedOrdering(order, inputSchema) + } + } + /** + * Creates a row ordering for the given schema, in natural ascending order. + */ + protected def newNaturalAscendingOrdering(dataTypes: Seq[DataType]): Ordering[InternalRow] = { + val order: Seq[SortOrder] = dataTypes.zipWithIndex.map { + case (dt, index) => new SortOrder(BoundReference(index, dt, nullable = true), Ascending) } + newOrdering(order, Seq.empty) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala deleted file mode 100644 index e5bbd0aaed0a..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution - -import java.io._ -import java.math.{BigDecimal, BigInteger} -import java.nio.ByteBuffer - -import scala.reflect.ClassTag - -import org.apache.spark.Logging -import org.apache.spark.serializer._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{MutableRow, SpecificMutableRow} -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - -/** - * The serialization stream for [[SparkSqlSerializer2]]. It assumes that the object passed in - * its `writeObject` are [[Product2]]. The serialization functions for the key and value of the - * [[Product2]] are constructed based on their schemata. - * The benefit of this serialization stream is that compared with general-purpose serializers like - * Kryo and Java serializer, it can significantly reduce the size of serialized and has a lower - * allocation cost, which can benefit the shuffle operation. Right now, its main limitations are: - * 1. It does not support complex types, i.e. Map, Array, and Struct. - * 2. It assumes that the objects passed in are [[Product2]]. So, it cannot be used when - * [[org.apache.spark.util.collection.ExternalSorter]]'s merge sort operation is used because - * the objects passed in the serializer are not in the type of [[Product2]]. Also also see - * the comment of the `serializer` method in [[Exchange]] for more information on it. - */ -private[sql] class Serializer2SerializationStream( - rowSchema: Array[DataType], - out: OutputStream) - extends SerializationStream with Logging { - - private val rowOut = new DataOutputStream(new BufferedOutputStream(out)) - private val writeRowFunc = SparkSqlSerializer2.createSerializationFunction(rowSchema, rowOut) - - override def writeObject[T: ClassTag](t: T): SerializationStream = { - val kv = t.asInstanceOf[Product2[InternalRow, InternalRow]] - writeKey(kv._1) - writeValue(kv._2) - - this - } - - override def writeKey[T: ClassTag](t: T): SerializationStream = { - // No-op. - this - } - - override def writeValue[T: ClassTag](t: T): SerializationStream = { - writeRowFunc(t.asInstanceOf[InternalRow]) - this - } - - def flush(): Unit = { - rowOut.flush() - } - - def close(): Unit = { - rowOut.close() - } -} - -/** - * The corresponding deserialization stream for [[Serializer2SerializationStream]]. - */ -private[sql] class Serializer2DeserializationStream( - rowSchema: Array[DataType], - in: InputStream) - extends DeserializationStream with Logging { - - private val rowIn = new DataInputStream(new BufferedInputStream(in)) - - private def rowGenerator(schema: Array[DataType]): () => (MutableRow) = { - if (schema == null) { - () => null - } else { - // It is safe to reuse the mutable row. - val mutableRow = new SpecificMutableRow(schema) - () => mutableRow - } - } - - // Functions used to return rows for key and value. - private val getRow = rowGenerator(rowSchema) - // Functions used to read a serialized row from the InputStream and deserialize it. - private val readRowFunc = SparkSqlSerializer2.createDeserializationFunction(rowSchema, rowIn) - - override def readObject[T: ClassTag](): T = { - readValue() - } - - override def readKey[T: ClassTag](): T = { - null.asInstanceOf[T] // intentionally left blank. 
- } - - override def readValue[T: ClassTag](): T = { - readRowFunc(getRow()).asInstanceOf[T] - } - - override def close(): Unit = { - rowIn.close() - } -} - -private[sql] class SparkSqlSerializer2Instance( - rowSchema: Array[DataType]) - extends SerializerInstance { - - def serialize[T: ClassTag](t: T): ByteBuffer = - throw new UnsupportedOperationException("Not supported.") - - def deserialize[T: ClassTag](bytes: ByteBuffer): T = - throw new UnsupportedOperationException("Not supported.") - - def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T = - throw new UnsupportedOperationException("Not supported.") - - def serializeStream(s: OutputStream): SerializationStream = { - new Serializer2SerializationStream(rowSchema, s) - } - - def deserializeStream(s: InputStream): DeserializationStream = { - new Serializer2DeserializationStream(rowSchema, s) - } -} - -/** - * SparkSqlSerializer2 is a special serializer that creates serialization function and - * deserialization function based on the schema of data. It assumes that values passed in - * are Rows. - */ -private[sql] class SparkSqlSerializer2(rowSchema: Array[DataType]) - extends Serializer - with Logging - with Serializable{ - - def newInstance(): SerializerInstance = new SparkSqlSerializer2Instance(rowSchema) - - override def supportsRelocationOfSerializedObjects: Boolean = { - // SparkSqlSerializer2 is stateless and writes no stream headers - true - } -} - -private[sql] object SparkSqlSerializer2 { - - final val NULL = 0 - final val NOT_NULL = 1 - - /** - * Check if rows with the given schema can be serialized with ShuffleSerializer. - * Right now, we do not support a schema having complex types or UDTs, or all data types - * of fields are NullTypes. - */ - def support(schema: Array[DataType]): Boolean = { - if (schema == null) return true - - var allNullTypes = true - var i = 0 - while (i < schema.length) { - schema(i) match { - case NullType => // Do nothing - case udt: UserDefinedType[_] => - allNullTypes = false - return false - case array: ArrayType => - allNullTypes = false - return false - case map: MapType => - allNullTypes = false - return false - case struct: StructType => - allNullTypes = false - return false - case _ => - allNullTypes = false - } - i += 1 - } - - // If types of fields are all NullTypes, we return false. - // Otherwise, we return true. - return !allNullTypes - } - - /** - * The util function to create the serialization function based on the given schema. - */ - def createSerializationFunction(schema: Array[DataType], out: DataOutputStream) - : InternalRow => Unit = { - (row: InternalRow) => - // If the schema is null, the returned function does nothing when it get called. - if (schema != null) { - var i = 0 - while (i < schema.length) { - schema(i) match { - // When we write values to the underlying stream, we also first write the null byte - // first. Then, if the value is not null, we write the contents out. - - case NullType => // Write nothing. 
- - case BooleanType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeBoolean(row.getBoolean(i)) - } - - case ByteType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeByte(row.getByte(i)) - } - - case ShortType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeShort(row.getShort(i)) - } - - case IntegerType | DateType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeInt(row.getInt(i)) - } - - case LongType | TimestampType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeLong(row.getLong(i)) - } - - case FloatType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeFloat(row.getFloat(i)) - } - - case DoubleType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - out.writeDouble(row.getDouble(i)) - } - - case StringType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - val bytes = row.getUTF8String(i).getBytes - out.writeInt(bytes.length) - out.write(bytes) - } - - case BinaryType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - val bytes = row.getBinary(i) - out.writeInt(bytes.length) - out.write(bytes) - } - - case decimal: DecimalType => - if (row.isNullAt(i)) { - out.writeByte(NULL) - } else { - out.writeByte(NOT_NULL) - val value = row.getDecimal(i, decimal.precision, decimal.scale) - val javaBigDecimal = value.toJavaBigDecimal - // First, write out the unscaled value. - val bytes: Array[Byte] = javaBigDecimal.unscaledValue().toByteArray - out.writeInt(bytes.length) - out.write(bytes) - // Then, write out the scale. - out.writeInt(javaBigDecimal.scale()) - } - } - i += 1 - } - } - } - - /** - * The util function to create the deserialization function based on the given schema. - */ - def createDeserializationFunction( - schema: Array[DataType], - in: DataInputStream): (MutableRow) => InternalRow = { - if (schema == null) { - (mutableRow: MutableRow) => null - } else { - (mutableRow: MutableRow) => { - var i = 0 - while (i < schema.length) { - schema(i) match { - // When we read values from the underlying stream, we also first read the null byte - // first. Then, if the value is not null, we update the field of the mutable row. - - case NullType => mutableRow.setNullAt(i) // Read nothing. 
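
The deleted serializer above frames every field as a null-marker byte followed by the value when the marker says NOT_NULL, and the deserializer mirrors that layout. A self-contained round-trip sketch of the same framing for a nullable Int, using only java.io (the object and method names are illustrative, not Spark code):

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

    object NullMarkerRoundTrip {
      final val NULL: Byte = 0
      final val NOT_NULL: Byte = 1

      // Write the null-marker byte first; only write the payload when the value is present.
      def writeNullableInt(out: DataOutputStream, value: Option[Int]): Unit = value match {
        case Some(v) => out.writeByte(NOT_NULL); out.writeInt(v)
        case None    => out.writeByte(NULL)
      }

      // Read the marker, then read the payload only if the marker says the value is present.
      def readNullableInt(in: DataInputStream): Option[Int] =
        if (in.readByte() == NULL) None else Some(in.readInt())

      def main(args: Array[String]): Unit = {
        val bytes = new ByteArrayOutputStream()
        val out = new DataOutputStream(bytes)
        Seq(Some(42), None, Some(7)).foreach(writeNullableInt(out, _))
        out.flush()

        val in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray))
        println(Seq.fill(3)(readNullableInt(in))) // List(Some(42), None, Some(7))
      }
    }
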
- - case BooleanType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setBoolean(i, in.readBoolean()) - } - - case ByteType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setByte(i, in.readByte()) - } - - case ShortType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setShort(i, in.readShort()) - } - - case IntegerType | DateType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setInt(i, in.readInt()) - } - - case LongType | TimestampType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setLong(i, in.readLong()) - } - - case FloatType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setFloat(i, in.readFloat()) - } - - case DoubleType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - mutableRow.setDouble(i, in.readDouble()) - } - - case StringType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - val length = in.readInt() - val bytes = new Array[Byte](length) - in.readFully(bytes) - mutableRow.update(i, UTF8String.fromBytes(bytes)) - } - - case BinaryType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - val length = in.readInt() - val bytes = new Array[Byte](length) - in.readFully(bytes) - mutableRow.update(i, bytes) - } - - case decimal: DecimalType => - if (in.readByte() == NULL) { - mutableRow.setNullAt(i) - } else { - // First, read in the unscaled value. - val length = in.readInt() - val bytes = new Array[Byte](length) - in.readFully(bytes) - val unscaledVal = new BigInteger(bytes) - // Then, read the scale. - val scale = in.readInt() - // Finally, create the Decimal object and set it in the row. - mutableRow.update(i, Decimal(new BigDecimal(unscaledVal, scale))) - } - } - i += 1 - } - - mutableRow - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 952ba7d45c13..4df53687a073 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -63,19 +63,23 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } /** - * Uses the ExtractEquiJoinKeys pattern to find joins where at least some of the predicates can be - * evaluated by matching hash keys. + * Uses the [[ExtractEquiJoinKeys]] pattern to find joins where at least some of the predicates + * can be evaluated by matching join keys. * - * This strategy applies a simple optimization based on the estimates of the physical sizes of - * the two join sides. When planning a [[joins.BroadcastHashJoin]], if one side has an - * estimated physical size smaller than the user-settable threshold - * [[org.apache.spark.sql.SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]], the planner would mark it as the - * ''build'' relation and mark the other relation as the ''stream'' side. The build table will be - * ''broadcasted'' to all of the executors involved in the join, as a - * [[org.apache.spark.broadcast.Broadcast]] object. If both estimates exceed the threshold, they - * will instead be used to decide the build side in a [[joins.ShuffledHashJoin]]. 
+ * Join implementations are chosen with the following precedence: + * + * - Broadcast: if one side of the join has an estimated physical size that is smaller than the + * user-configurable [[org.apache.spark.sql.SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold + * or if that side has an explicit broadcast hint (e.g. the user applied the + * [[org.apache.spark.sql.functions.broadcast()]] function to a DataFrame), then that side + * of the join will be broadcasted and the other side will be streamed, with no shuffling + * performed. If both sides of the join are eligible to be broadcasted then the + * - Sort merge: if the matching join keys are sortable and + * [[org.apache.spark.sql.SQLConf.SORTMERGE_JOIN]] is enabled (default), then sort merge join + * will be used. + * - Hash: will be chosen if neither of the above optimizations apply to this join. */ - object HashJoin extends Strategy with PredicateHelper { + object EquiJoinSelection extends Strategy with PredicateHelper { private[this] def makeBroadcastHashJoin( leftKeys: Seq[Expression], @@ -90,14 +94,15 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + + // --- Inner joins -------------------------------------------------------------------------- + case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, CanBroadcast(right)) => makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, joins.BuildRight) case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, CanBroadcast(left), right) => makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, joins.BuildLeft) - // If the sort merge join option is set, we want to use sort merge join prior to hashjoin - // for now let's support inner join first, then add outer join case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) if sqlContext.conf.sortMergeJoinEnabled && RowOrdering.isOrderable(leftKeys) => val mergeJoin = @@ -115,6 +120,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { leftKeys, rightKeys, buildSide, planLater(left), planLater(right)) condition.map(Filter(_, hashJoin)).getOrElse(hashJoin) :: Nil + // --- Outer joins -------------------------------------------------------------------------- + case ExtractEquiJoinKeys( LeftOuter, leftKeys, rightKeys, condition, left, CanBroadcast(right)) => joins.BroadcastHashOuterJoin( @@ -125,10 +132,22 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { joins.BroadcastHashOuterJoin( leftKeys, rightKeys, RightOuter, condition, planLater(left), planLater(right)) :: Nil + case ExtractEquiJoinKeys(LeftOuter, leftKeys, rightKeys, condition, left, right) + if sqlContext.conf.sortMergeJoinEnabled && RowOrdering.isOrderable(leftKeys) => + joins.SortMergeOuterJoin( + leftKeys, rightKeys, LeftOuter, condition, planLater(left), planLater(right)) :: Nil + + case ExtractEquiJoinKeys(RightOuter, leftKeys, rightKeys, condition, left, right) + if sqlContext.conf.sortMergeJoinEnabled && RowOrdering.isOrderable(leftKeys) => + joins.SortMergeOuterJoin( + leftKeys, rightKeys, RightOuter, condition, planLater(left), planLater(right)) :: Nil + case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) => joins.ShuffledHashOuterJoin( leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil + // --- Cases where this strategy does not apply --------------------------------------------- + case _ => Nil } } 
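
The new scaladoc above spells out the precedence EquiJoinSelection applies: broadcast, then sort merge, then shuffled hash. A hedged sketch of how user code can steer that choice, assuming a local Spark 1.5 setup; the data, the threshold value, and the object name are made up for illustration:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.sql.functions.broadcast

    object JoinSelectionDemo {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("joins").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        val facts = sc.parallelize(1 to 1000).map(i => (i % 10, i)).toDF("key", "value")
        val dims  = sc.parallelize(0 until 10).map(i => (i, s"dim_$i")).toDF("key", "name")

        // Raise or lower the size threshold under which a side is auto-broadcast.
        sqlContext.setConf("spark.sql.autoBroadcastJoinThreshold", (10 * 1024 * 1024).toString)

        // Explicit hint: ask for the small side to be broadcast regardless of size estimates.
        val hinted = facts.join(broadcast(dims), "key")
        hinted.explain() // the physical plan should show a broadcast join for the hinted side

        sc.stop()
      }
    }
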
@@ -136,32 +155,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object HashAggregation extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { // Aggregations that can be performed in two phases, before and after the shuffle. - - // Cases where all aggregates can be codegened. - case PartialAggregation( - namedGroupingAttributes, - rewrittenAggregateExpressions, - groupingExpressions, - partialComputation, - child) - if canBeCodeGened( - allAggregates(partialComputation) ++ - allAggregates(rewrittenAggregateExpressions)) && - codegenEnabled && - !canBeConvertedToNewAggregation(plan) => - execution.GeneratedAggregate( - partial = false, - namedGroupingAttributes, - rewrittenAggregateExpressions, - unsafeEnabled, - execution.GeneratedAggregate( - partial = true, - groupingExpressions, - partialComputation, - unsafeEnabled, - planLater(child))) :: Nil - - // Cases where some aggregate can not be codegened case PartialAggregation( namedGroupingAttributes, rewrittenAggregateExpressions, @@ -192,14 +185,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case _ => false } - def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = aggs.forall { - case _: Sum | _: Count | _: Max | _: Min | _: CombineSetsAndCount => true - // The generated set implementation is pretty limited ATM. - case CollectHashSet(exprs) if exprs.size == 1 && - Seq(IntegerType, LongType).contains(exprs.head.dataType) => true - case _ => false - } - def allAggregates(exprs: Seq[Expression]): Seq[AggregateExpression1] = exprs.flatMap(_.collect { case a: AggregateExpression1 => a }) } @@ -225,8 +210,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // aggregate function to the corresponding attribute of the function. 
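
HashAggregation above plans aggregations "in two phases, before and after the shuffle": a partial aggregate computed per partition, then a final merge of the partial results. A minimal plain-Scala sketch of that two-phase idea for a per-key sum (illustrative only; it does not use the planner or RDDs):

    // Illustrative two-phase aggregation: partial sums per "partition", then a final merge.
    object TwoPhaseSum {
      def partial(partition: Seq[(String, Long)]): Map[String, Long] =
        partition.groupBy(_._1).mapValues(_.map(_._2).sum).toMap

      def merge(parts: Seq[Map[String, Long]]): Map[String, Long] =
        parts.flatten.groupBy(_._1).mapValues(_.map(_._2).sum).toMap

      def main(args: Array[String]): Unit = {
        val partitions = Seq(
          Seq("a" -> 1L, "b" -> 2L, "a" -> 3L),  // would live on one executor
          Seq("b" -> 10L, "c" -> 5L))            // would live on another
        println(merge(partitions.map(partial)))  // a -> 4, b -> 12, c -> 5 (order may vary)
      }
    }
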
val aggregateFunctionMap = aggregateExpressions.map { agg => val aggregateFunction = agg.aggregateFunction + val attribtue = Alias(aggregateFunction, aggregateFunction.toString)().toAttribute (aggregateFunction, agg.isDistinct) -> - Alias(aggregateFunction, aggregateFunction.toString)().toAttribute + (aggregateFunction -> attribtue) }.toMap val (functionsWithDistinct, functionsWithoutDistinct) = @@ -396,12 +382,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.Generate( generator, join = join, outer = outer, g.output, planLater(child)) :: Nil case logical.OneRowRelation => - execution.PhysicalRDD(Nil, singleRowRdd) :: Nil + execution.PhysicalRDD(Nil, singleRowRdd, "OneRowRelation") :: Nil case logical.RepartitionByExpression(expressions, child) => execution.Exchange(HashPartitioning(expressions, numPartitions), planLater(child)) :: Nil case e @ EvaluatePython(udf, child, _) => BatchPythonEvaluation(udf, e.output, planLater(child)) :: Nil - case LogicalRDD(output, rdd) => PhysicalRDD(output, rdd) :: Nil + case LogicalRDD(output, rdd) => PhysicalRDD(output, rdd, "PhysicalRDD") :: Nil case BroadcastHint(child) => apply(child) case _ => Nil } @@ -409,22 +395,22 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object DDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case CreateTableUsing(tableName, userSpecifiedSchema, provider, true, opts, false, _) => + case CreateTableUsing(tableIdent, userSpecifiedSchema, provider, true, opts, false, _) => ExecutedCommand( CreateTempTableUsing( - tableName, userSpecifiedSchema, provider, opts)) :: Nil + tableIdent, userSpecifiedSchema, provider, opts)) :: Nil case c: CreateTableUsing if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.") case c: CreateTableUsing if c.temporary && c.allowExisting => sys.error("allowExisting should be set to false when creating a temporary table.") - case CreateTableUsingAsSelect(tableName, provider, true, partitionsCols, mode, opts, query) + case CreateTableUsingAsSelect(tableIdent, provider, true, partitionsCols, mode, opts, query) if partitionsCols.nonEmpty => sys.error("Cannot create temporary partitioned table.") - case CreateTableUsingAsSelect(tableName, provider, true, _, mode, opts, query) => + case CreateTableUsingAsSelect(tableIdent, provider, true, _, mode, opts, query) => val cmd = CreateTempTableUsingAsSelect( - tableName, provider, Array.empty[String], mode, opts, query) + tableIdent, provider, Array.empty[String], mode, opts, query) ExecutedCommand(cmd) :: Nil case c: CreateTableUsingAsSelect if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. 
Use a HiveContext instead.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala index 16498da080c8..e060c06d9e2a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import java.io.{DataInputStream, DataOutputStream, OutputStream, InputStream} +import java.io._ import java.nio.ByteBuffer import scala.reflect.ClassTag @@ -26,7 +26,7 @@ import com.google.common.io.ByteStreams import org.apache.spark.serializer.{SerializationStream, DeserializationStream, SerializerInstance, Serializer} import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.unsafe.Platform /** * Serializer for serializing [[UnsafeRow]]s during shuffle. Since UnsafeRows are already stored as @@ -58,19 +58,21 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst */ override def serializeStream(out: OutputStream): SerializationStream = new SerializationStream { private[this] var writeBuffer: Array[Byte] = new Array[Byte](4096) - private[this] val dOut: DataOutputStream = new DataOutputStream(out) + private[this] val dOut: DataOutputStream = + new DataOutputStream(new BufferedOutputStream(out)) override def writeValue[T: ClassTag](value: T): SerializationStream = { val row = value.asInstanceOf[UnsafeRow] + dOut.writeInt(row.getSizeInBytes) - row.writeToStream(out, writeBuffer) + row.writeToStream(dOut, writeBuffer) this } override def writeKey[T: ClassTag](key: T): SerializationStream = { // The key is only needed on the map side when computing partition ids. It does not need to // be shuffled. 
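
The UnsafeRowSerializer changes above buffer the underlying streams and write each row as its size in bytes followed by the row bytes, terminated by an EOF sentinel that lets the reader close the stream early. A standalone sketch of that length-prefixed framing using only java.io (the EOF value of -1 and all names here are assumptions for illustration; the real serializer also reuses and grows a single row buffer instead of allocating per record):

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

    object LengthPrefixedFraming {
      val EOF = -1 // assumed sentinel; any value that cannot be a valid record size would do

      def writeAll(records: Seq[Array[Byte]]): Array[Byte] = {
        val bytes = new ByteArrayOutputStream()
        val out = new DataOutputStream(bytes)
        records.foreach { r =>
          out.writeInt(r.length) // size prefix
          out.write(r)           // payload
        }
        out.writeInt(EOF)        // end-of-stream marker
        out.flush()
        bytes.toByteArray
      }

      def readAll(data: Array[Byte]): Seq[Array[Byte]] = {
        val in = new DataInputStream(new ByteArrayInputStream(data))
        Iterator.continually(in.readInt())
          .takeWhile(_ != EOF)
          .map { size =>
            val buf = new Array[Byte](size)
            in.readFully(buf)
            buf
          }.toVector
      }

      def main(args: Array[String]): Unit = {
        val rows = Seq("row1", "row2", "a longer row").map(_.getBytes("UTF-8"))
        val decoded = readAll(writeAll(rows))
        println(decoded.map(bytes => new String(bytes, "UTF-8"))) // Vector(row1, row2, a longer row)
      }
    }
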
- assert(key.isInstanceOf[Int]) + assert(null == key || key.isInstanceOf[Int]) this } @@ -97,7 +99,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst override def deserializeStream(in: InputStream): DeserializationStream = { new DeserializationStream { - private[this] val dIn: DataInputStream = new DataInputStream(in) + private[this] val dIn: DataInputStream = new DataInputStream(new BufferedInputStream(in)) // 1024 is a default buffer size; this buffer will grow to accommodate larger rows private[this] var rowBuffer: Array[Byte] = new Array[Byte](1024) private[this] var row: UnsafeRow = new UnsafeRow() @@ -106,6 +108,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst override def asKeyValueIterator: Iterator[(Int, UnsafeRow)] = { new Iterator[(Int, UnsafeRow)] { private[this] var rowSize: Int = dIn.readInt() + if (rowSize == EOF) dIn.close() override def hasNext: Boolean = rowSize != EOF @@ -113,10 +116,11 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst if (rowBuffer.length < rowSize) { rowBuffer = new Array[Byte](rowSize) } - ByteStreams.readFully(in, rowBuffer, 0, rowSize) - row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize) + ByteStreams.readFully(dIn, rowBuffer, 0, rowSize) + row.pointTo(rowBuffer, Platform.BYTE_ARRAY_OFFSET, numFields, rowSize) rowSize = dIn.readInt() // read the next row's size if (rowSize == EOF) { // We are returning the last row in this stream + dIn.close() val _rowTuple = rowTuple // Null these out so that the byte array can be garbage collected once the entire // iterator has been consumed @@ -147,8 +151,8 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst if (rowBuffer.length < rowSize) { rowBuffer = new Array[Byte](rowSize) } - ByteStreams.readFully(in, rowBuffer, 0, rowSize) - row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize) + ByteStreams.readFully(dIn, rowBuffer, 0, rowSize) + row.pointTo(rowBuffer, Platform.BYTE_ARRAY_OFFSET, numFields, rowSize) row.asInstanceOf[T] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala index fe9f2c702817..0269d6d4b7a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala @@ -101,6 +101,8 @@ case class Window( override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override def canProcessUnsafeRows: Boolean = true + /** * Create a bound ordering object for a given frame type and offset. A bound ordering object is * used to determine which input row lies within the frame boundaries of an output row. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/Aggregate.scala deleted file mode 100644 index cf568dc04867..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/Aggregate.scala +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.aggregate - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, ClusteredDistribution, AllTuples, Distribution} -import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan, UnaryNode} -import org.apache.spark.sql.types.StructType - -/** - * An Aggregate Operator used to evaluate [[AggregateFunction2]]. Based on the data types - * of the grouping expressions and aggregate functions, it determines if it uses - * sort-based aggregation and hybrid (hash-based with sort-based as the fallback) to - * process input rows. - */ -case class Aggregate( - requiredChildDistributionExpressions: Option[Seq[Expression]], - groupingExpressions: Seq[NamedExpression], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - child: SparkPlan) - extends UnaryNode { - - private[this] val allAggregateExpressions = - nonCompleteAggregateExpressions ++ completeAggregateExpressions - - private[this] val hasNonAlgebricAggregateFunctions = - !allAggregateExpressions.forall(_.aggregateFunction.isInstanceOf[AlgebraicAggregate]) - - // Use the hybrid iterator if (1) unsafe is enabled, (2) the schemata of - // grouping key and aggregation buffer is supported; and (3) all - // aggregate functions are algebraic. - private[this] val supportsHybridIterator: Boolean = { - val aggregationBufferSchema: StructType = - StructType.fromAttributes( - allAggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes)) - val groupKeySchema: StructType = - StructType.fromAttributes(groupingExpressions.map(_.toAttribute)) - - val schemaSupportsUnsafe: Boolean = - UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) && - UnsafeProjection.canSupport(groupKeySchema) - - // TODO: Use the hybrid iterator for non-algebric aggregate functions. - sqlContext.conf.unsafeEnabled && schemaSupportsUnsafe && !hasNonAlgebricAggregateFunctions - } - - // We need to use sorted input if we have grouping expressions, and - // we cannot use the hybrid iterator or the hybrid is disabled. 
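
The sort-based path referenced above (and implemented by SortBasedAggregate later in this patch) requires input sorted by the grouping expressions, so each group can be finished in a single pass while holding only one aggregation buffer at a time. A small plain-Scala sketch of that single-pass pattern over pre-sorted key/value pairs (names are illustrative):

    // Single-pass aggregation over input already sorted by the grouping key:
    // only the buffer for the current group is kept in memory.
    object SortedGroupSum {
      def sumSortedByKey(sorted: Seq[(String, Long)]): Seq[(String, Long)] = {
        val out = Seq.newBuilder[(String, Long)]
        var currentKey: Option[String] = None
        var buffer = 0L
        sorted.foreach { case (k, v) =>
          if (currentKey.exists(_ == k)) {
            buffer += v                                          // same group: update the buffer
          } else {
            currentKey.foreach(key => out += (key -> buffer))    // emit the finished group
            currentKey = Some(k)
            buffer = v
          }
        }
        currentKey.foreach(key => out += (key -> buffer))        // emit the last group
        out.result()
      }

      def main(args: Array[String]): Unit = {
        val sorted = Seq("a" -> 1L, "a" -> 2L, "b" -> 5L, "c" -> 1L, "c" -> 1L)
        println(sumSortedByKey(sorted)) // List((a,3), (b,5), (c,2))
      }
    }
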
- private[this] val requiresSortedInput: Boolean = { - groupingExpressions.nonEmpty && !supportsHybridIterator - } - - override def canProcessUnsafeRows: Boolean = !hasNonAlgebricAggregateFunctions - - // If result expressions' data types are all fixed length, we generate unsafe rows - // (We have this requirement instead of check the result of UnsafeProjection.canSupport - // is because we use a mutable projection to generate the result). - override def outputsUnsafeRows: Boolean = { - // resultExpressions.map(_.dataType).forall(UnsafeRow.isFixedLength) - // TODO: Supports generating UnsafeRows. We can just re-enable the line above and fix - // any issue we get. - false - } - - override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) - - override def requiredChildDistribution: List[Distribution] = { - requiredChildDistributionExpressions match { - case Some(exprs) if exprs.length == 0 => AllTuples :: Nil - case Some(exprs) if exprs.length > 0 => ClusteredDistribution(exprs) :: Nil - case None => UnspecifiedDistribution :: Nil - } - } - - override def requiredChildOrdering: Seq[Seq[SortOrder]] = { - if (requiresSortedInput) { - // TODO: We should not sort the input rows if they are just in reversed order. - groupingExpressions.map(SortOrder(_, Ascending)) :: Nil - } else { - Seq.fill(children.size)(Nil) - } - } - - override def outputOrdering: Seq[SortOrder] = { - if (requiresSortedInput) { - // It is possible that the child.outputOrdering starts with the required - // ordering expressions (e.g. we require [a] as the sort expression and the - // child's outputOrdering is [a, b]). We can only guarantee the output rows - // are sorted by values of groupingExpressions. - groupingExpressions.map(SortOrder(_, Ascending)) - } else { - Nil - } - } - - protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { - child.execute().mapPartitions { iter => - // Because the constructor of an aggregation iterator will read at least the first row, - // we need to get the value of iter.hasNext first. - val hasInput = iter.hasNext - val useHybridIterator = - hasInput && - supportsHybridIterator && - groupingExpressions.nonEmpty - if (useHybridIterator) { - UnsafeHybridAggregationIterator.createFromInputIterator( - groupingExpressions, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection _, - child.output, - iter, - outputsUnsafeRows) - } else { - if (!hasInput && groupingExpressions.nonEmpty) { - // This is a grouped aggregate and the input iterator is empty, - // so return an empty iterator. - Iterator[InternalRow]() - } else { - val outputIter = SortBasedAggregationIterator.createFromInputIterator( - groupingExpressions, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection _ , - newProjection _, - child.output, - iter, - outputsUnsafeRows) - if (!hasInput && groupingExpressions.isEmpty) { - // There is no input and there is no grouping expressions. - // We need to output a single row as the output. 
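
The empty-input branches above encode standard SQL semantics that the new SortBasedAggregate keeps: a global aggregate over an empty input still produces exactly one output row, while a grouped aggregate over an empty input produces none. A hedged, self-contained sketch that observes this with DataFrames, assuming a local Spark 1.5 runtime (application name, data, and object name are illustrative):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.sql.functions.sum

    object EmptyInputAggregates {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("empty-agg").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        val empty = sc.parallelize(Seq.empty[(String, Long)]).toDF("key", "value")

        // Global aggregate: exactly one row even though the input is empty (its sum is null).
        println(empty.agg(sum("value")).collect().length)                // 1

        // Grouped aggregate: no groups, hence no output rows.
        println(empty.groupBy("key").agg(sum("value")).collect().length) // 0

        sc.stop()
      }
    }
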
- Iterator[InternalRow](outputIter.outputForEmptyGroupingKeyWithoutInput()) - } else { - outputIter - } - } - } - } - } - - override def simpleString: String = { - val iterator = if (supportsHybridIterator && groupingExpressions.nonEmpty) { - classOf[UnsafeHybridAggregationIterator].getSimpleName - } else { - classOf[SortBasedAggregationIterator].getSimpleName - } - - s"""NewAggregate with $iterator ${groupingExpressions} ${allAggregateExpressions}""" - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala new file mode 100644 index 000000000000..f4c14a9b3556 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.errors._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, ClusteredDistribution, AllTuples, Distribution} +import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan, UnaryNode} +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.types.StructType + +case class SortBasedAggregate( + requiredChildDistributionExpressions: Option[Seq[Expression]], + groupingExpressions: Seq[NamedExpression], + nonCompleteAggregateExpressions: Seq[AggregateExpression2], + nonCompleteAggregateAttributes: Seq[Attribute], + completeAggregateExpressions: Seq[AggregateExpression2], + completeAggregateAttributes: Seq[Attribute], + initialInputBufferOffset: Int, + resultExpressions: Seq[NamedExpression], + child: SparkPlan) + extends UnaryNode { + + override private[sql] lazy val metrics = Map( + "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + + override def outputsUnsafeRows: Boolean = false + + override def canProcessUnsafeRows: Boolean = false + + override def canProcessSafeRows: Boolean = true + + override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) + + override def requiredChildDistribution: List[Distribution] = { + requiredChildDistributionExpressions match { + case Some(exprs) if exprs.length == 0 => AllTuples :: Nil + case Some(exprs) if exprs.length > 0 => ClusteredDistribution(exprs) :: Nil + case None => UnspecifiedDistribution :: Nil + } + } + + override def 
requiredChildOrdering: Seq[Seq[SortOrder]] = { + groupingExpressions.map(SortOrder(_, Ascending)) :: Nil + } + + override def outputOrdering: Seq[SortOrder] = { + groupingExpressions.map(SortOrder(_, Ascending)) + } + + protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { + val numInputRows = longMetric("numInputRows") + val numOutputRows = longMetric("numOutputRows") + child.execute().mapPartitions { iter => + // Because the constructor of an aggregation iterator will read at least the first row, + // we need to get the value of iter.hasNext first. + val hasInput = iter.hasNext + if (!hasInput && groupingExpressions.nonEmpty) { + // This is a grouped aggregate and the input iterator is empty, + // so return an empty iterator. + Iterator[InternalRow]() + } else { + val outputIter = SortBasedAggregationIterator.createFromInputIterator( + groupingExpressions, + nonCompleteAggregateExpressions, + nonCompleteAggregateAttributes, + completeAggregateExpressions, + completeAggregateAttributes, + initialInputBufferOffset, + resultExpressions, + newMutableProjection _, + newProjection _, + child.output, + iter, + outputsUnsafeRows, + numInputRows, + numOutputRows) + if (!hasInput && groupingExpressions.isEmpty) { + // There is no input and there is no grouping expressions. + // We need to output a single row as the output. + numOutputRows += 1 + Iterator[InternalRow](outputIter.outputForEmptyGroupingKeyWithoutInput()) + } else { + outputIter + } + } + } + } + + override def simpleString: String = { + val allAggregateExpressions = nonCompleteAggregateExpressions ++ completeAggregateExpressions + + val keyString = groupingExpressions.mkString("[", ",", "]") + val functionString = allAggregateExpressions.mkString("[", ",", "]") + val outputString = output.mkString("[", ",", "]") + s"SortBasedAggregate(key=$keyString, functions=$functionString, output=$outputString)" + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala index 78bcee16c9d0..73d50e07cf0b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression2, AggregateFunction2} -import org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.execution.metric.LongSQLMetric import org.apache.spark.unsafe.KVIterator /** @@ -39,7 +38,9 @@ class SortBasedAggregationIterator( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - outputsUnsafeRows: Boolean) + outputsUnsafeRows: Boolean, + numInputRows: LongSQLMetric, + numOutputRows: LongSQLMetric) extends AggregationIterator( groupingKeyAttributes, valueAttributes, @@ -57,7 +58,7 @@ class SortBasedAggregationIterator( val bufferRowSize: Int = bufferSchema.length val genericMutableBuffer = new GenericMutableRow(bufferRowSize) - val useUnsafeBuffer = bufferSchema.map(_.dataType).forall(UnsafeRow.isFixedLength) + val 
useUnsafeBuffer = bufferSchema.map(_.dataType).forall(UnsafeRow.isMutable) val buffer = if (useUnsafeBuffer) { val unsafeProjection = @@ -105,6 +106,7 @@ class SortBasedAggregationIterator( // Get the grouping key. val groupingKey = inputKVIterator.getKey val currentRow = inputKVIterator.getValue + numInputRows += 1 // Check if the current row belongs the current input row. if (currentGroupingKey == groupingKey) { @@ -139,7 +141,7 @@ class SortBasedAggregationIterator( val outputRow = generateOutput(currentGroupingKey, sortBasedAggregationBuffer) // Initialize buffer values for the next group. initializeBuffer(sortBasedAggregationBuffer) - + numOutputRows += 1 outputRow } else { // no more result @@ -153,7 +155,7 @@ class SortBasedAggregationIterator( nextGroupingKey = inputKVIterator.getKey().copy() firstRowInNextGroup = inputKVIterator.getValue().copy() - + numInputRows += 1 sortedInputHasNewGroup = true } else { // This inputIter is empty. @@ -183,7 +185,9 @@ object SortBasedAggregationIterator { newProjection: (Seq[Expression], Seq[Attribute]) => Projection, inputAttributes: Seq[Attribute], inputIter: Iterator[InternalRow], - outputsUnsafeRows: Boolean): SortBasedAggregationIterator = { + outputsUnsafeRows: Boolean, + numInputRows: LongSQLMetric, + numOutputRows: LongSQLMetric): SortBasedAggregationIterator = { val kvIterator = if (UnsafeProjection.canSupport(groupingExprs)) { AggregationIterator.unsafeKVIterator( groupingExprs, @@ -204,33 +208,9 @@ object SortBasedAggregationIterator { initialInputBufferOffset, resultExpressions, newMutableProjection, - outputsUnsafeRows) - } - - def createFromKVIterator( - groupingKeyAttributes: Seq[Attribute], - valueAttributes: Seq[Attribute], - inputKVIterator: KVIterator[InternalRow, InternalRow], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - outputsUnsafeRows: Boolean): SortBasedAggregationIterator = { - new SortBasedAggregationIterator( - groupingKeyAttributes, - valueAttributes, - inputKVIterator, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection, - outputsUnsafeRows) + outputsUnsafeRows, + numInputRows, + numOutputRows) } // scalastyle:on } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala new file mode 100644 index 000000000000..ba379d358d20 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.TaskContext +import org.apache.spark.rdd.{MapPartitionsWithPreparationRDD, RDD} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.errors._ +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2 +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.{UnaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics + +case class TungstenAggregate( + requiredChildDistributionExpressions: Option[Seq[Expression]], + groupingExpressions: Seq[NamedExpression], + nonCompleteAggregateExpressions: Seq[AggregateExpression2], + completeAggregateExpressions: Seq[AggregateExpression2], + initialInputBufferOffset: Int, + resultExpressions: Seq[NamedExpression], + child: SparkPlan) + extends UnaryNode { + + override private[sql] lazy val metrics = Map( + "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + + override def outputsUnsafeRows: Boolean = true + + override def canProcessUnsafeRows: Boolean = true + + override def canProcessSafeRows: Boolean = true + + override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) + + override def requiredChildDistribution: List[Distribution] = { + requiredChildDistributionExpressions match { + case Some(exprs) if exprs.length == 0 => AllTuples :: Nil + case Some(exprs) if exprs.length > 0 => ClusteredDistribution(exprs) :: Nil + case None => UnspecifiedDistribution :: Nil + } + } + + // This is for testing. We force TungstenAggregationIterator to fall back to sort-based + // aggregation once it has processed a given number of input rows. + private val testFallbackStartsAt: Option[Int] = { + sqlContext.getConf("spark.sql.TungstenAggregate.testFallbackStartsAt", null) match { + case null | "" => None + case fallbackStartsAt => Some(fallbackStartsAt.toInt) + } + } + + protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { + val numInputRows = longMetric("numInputRows") + val numOutputRows = longMetric("numOutputRows") + + /** + * Set up the underlying unsafe data structures used before computing the parent partition. + * This makes sure our iterator is not starved by other operators in the same task. + */ + def preparePartition(): TungstenAggregationIterator = { + new TungstenAggregationIterator( + groupingExpressions, + nonCompleteAggregateExpressions, + completeAggregateExpressions, + initialInputBufferOffset, + resultExpressions, + newMutableProjection, + child.output, + testFallbackStartsAt, + numInputRows, + numOutputRows) + } + + /** Compute a partition using the iterator already set up previously. 
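Editor's note: TungstenAggregate reads the testing-only configuration key "spark.sql.TungstenAggregate.testFallbackStartsAt" (shown in the diff above) to force the iterator to abandon hashing after a fixed number of input rows. Below is a hedged sketch of the same null-or-empty-to-Option parsing against a plain Map standing in for SQLConf; in a test one would presumably set this key (for example via sqlContext.setConf(key, "5")) before running a query so the sort-based fallback path is exercised even on small inputs.

object FallbackConfSketch {
  // The key is the one that appears in the diff; the Map stands in for SQLConf.
  val key = "spark.sql.TungstenAggregate.testFallbackStartsAt"

  def fallbackStartsAt(conf: Map[String, String]): Option[Int] =
    conf.get(key).filter(_.nonEmpty).map(_.toInt)

  def main(args: Array[String]): Unit = {
    println(fallbackStartsAt(Map.empty))         // None: normal hash-first path
    println(fallbackStartsAt(Map(key -> "5")))   // Some(5): fall back after 5 rows
  }
}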
*/ + def executePartition( + context: TaskContext, + partitionIndex: Int, + aggregationIterator: TungstenAggregationIterator, + parentIterator: Iterator[InternalRow]): Iterator[UnsafeRow] = { + val hasInput = parentIterator.hasNext + if (!hasInput) { + // We're not using the underlying map, so we just can free it here + aggregationIterator.free() + if (groupingExpressions.isEmpty) { + numOutputRows += 1 + Iterator.single[UnsafeRow](aggregationIterator.outputForEmptyGroupingKeyWithoutInput()) + } else { + // This is a grouped aggregate and the input iterator is empty, + // so return an empty iterator. + Iterator.empty + } + } else { + aggregationIterator.start(parentIterator) + aggregationIterator + } + } + + // Note: we need to set up the iterator in each partition before computing the + // parent partition, so we cannot simply use `mapPartitions` here (SPARK-9747). + val resultRdd = { + new MapPartitionsWithPreparationRDD[UnsafeRow, InternalRow, TungstenAggregationIterator]( + child.execute(), preparePartition, executePartition, preservesPartitioning = true) + } + resultRdd.asInstanceOf[RDD[InternalRow]] + } + + override def simpleString: String = { + val allAggregateExpressions = nonCompleteAggregateExpressions ++ completeAggregateExpressions + + testFallbackStartsAt match { + case None => + val keyString = groupingExpressions.mkString("[", ",", "]") + val functionString = allAggregateExpressions.mkString("[", ",", "]") + val outputString = output.mkString("[", ",", "]") + s"TungstenAggregate(key=$keyString, functions=$functionString, output=$outputString)" + case Some(fallbackStartsAt) => + s"TungstenAggregateWithControlledFallback $groupingExpressions " + + s"$allAggregateExpressions $resultExpressions fallbackStartsAt=$fallbackStartsAt" + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala new file mode 100644 index 000000000000..26fdbc83ef50 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -0,0 +1,721 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
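Editor's note: the MapPartitionsWithPreparationRDD usage above (see the SPARK-9747 note) separates "prepare" (allocate the operator's unsafe data structures) from "execute" (consume the parent iterator), so the aggregation map can reserve memory before upstream operators in the same task do. A generic, Spark-free sketch of that two-phase shape; all names here are illustrative.

object PrepareThenExecuteSketch {
  def mapPartitionWithPreparation[P, T, U](
      parent: Iterator[T],
      prepare: () => P,
      execute: (P, Iterator[T]) => Iterator[U]): Iterator[U] = {
    val prepared = prepare()   // runs before the parent iterator is touched
    execute(prepared, parent)
  }

  def main(args: Array[String]): Unit = {
    val out = mapPartitionWithPreparation[StringBuilder, Int, String](
      parent = Iterator(1, 2, 3),
      prepare = () => new StringBuilder("buf:"),  // stands in for allocating the hash map
      execute = (buf, rows) => rows.map(i => buf.toString + i))
    println(out.toList)   // List(buf:1, buf:2, buf:3)
  }
}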
+ */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.unsafe.KVIterator +import org.apache.spark.{InternalAccumulator, Logging, SparkEnv, TaskContext} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.{UnsafeKVExternalSorter, UnsafeFixedWidthAggregationMap} +import org.apache.spark.sql.execution.metric.LongSQLMetric +import org.apache.spark.sql.types.StructType + +/** + * An iterator used to evaluate aggregate functions. It operates on [[UnsafeRow]]s. + * + * This iterator first uses hash-based aggregation to process input rows. It uses + * a hash map to store groups and their corresponding aggregation buffers. If we + * this map cannot allocate memory from [[org.apache.spark.shuffle.ShuffleMemoryManager]], + * it switches to sort-based aggregation. The process of the switch has the following step: + * - Step 1: Sort all entries of the hash map based on values of grouping expressions and + * spill them to disk. + * - Step 2: Create a external sorter based on the spilled sorted map entries. + * - Step 3: Redirect all input rows to the external sorter. + * - Step 4: Get a sorted [[KVIterator]] from the external sorter. + * - Step 5: Initialize sort-based aggregation. + * Then, this iterator works in the way of sort-based aggregation. + * + * The code of this class is organized as follows: + * - Part 1: Initializing aggregate functions. + * - Part 2: Methods and fields used by setting aggregation buffer values, + * processing input rows from inputIter, and generating output + * rows. + * - Part 3: Methods and fields used by hash-based aggregation. + * - Part 4: Methods and fields used when we switch to sort-based aggregation. + * - Part 5: Methods and fields used by sort-based aggregation. + * - Part 6: Loads input and process input rows. + * - Part 7: Public methods of this iterator. + * - Part 8: A utility function used to generate a result when there is no + * input and there is no grouping expression. + * + * @param groupingExpressions + * expressions for grouping keys + * @param nonCompleteAggregateExpressions + * [[AggregateExpression2]] containing [[AggregateFunction2]]s with mode [[Partial]], + * [[PartialMerge]], or [[Final]]. + * @param completeAggregateExpressions + * [[AggregateExpression2]] containing [[AggregateFunction2]]s with mode [[Complete]]. + * @param initialInputBufferOffset + * If this iterator is used to handle functions with mode [[PartialMerge]] or [[Final]]. + * The input rows have the format of `grouping keys + aggregation buffer`. + * This offset indicates the starting position of aggregation buffer in a input row. + * @param resultExpressions + * expressions for generating output rows. + * @param newMutableProjection + * the function used to create mutable projections. + * @param originalInputAttributes + * attributes of representing input rows from `inputIter`. 
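Editor's note: the scaladoc above describes the central algorithm of this iterator: aggregate in a hash map while memory allows, spill the sorted map contents, push the remaining input through an external sorter, and finish with sort-based aggregation. The following self-contained sketch reproduces that shape in miniature; a group-count cap stands in for the shuffle memory manager, a sorted buffer stands in for the external sorter, and the aggregate is a simple sum.

object HashThenSortFallbackSketch {
  def sumByKey(input: Iterator[(String, Long)], maxHashGroups: Int): Map[String, Long] = {
    val hashMap = scala.collection.mutable.HashMap.empty[String, Long]
    val spilled = scala.collection.mutable.ArrayBuffer.empty[(String, Long)]
    var sortBased = false

    while (!sortBased && input.hasNext) {
      val (k, v) = input.next()
      if (hashMap.contains(k) || hashMap.size < maxHashGroups) {
        hashMap(k) = hashMap.getOrElse(k, 0L) + v   // hash-based update
      } else {
        // "Out of memory": spill the partially aggregated map and switch modes.
        spilled ++= hashMap
        hashMap.clear()
        spilled += ((k, v))
        sortBased = true
      }
    }
    // Sort-based phase: redirect the rest of the input, sort, then merge equal-key runs.
    spilled ++= input
    val merged = spilled.sortBy(_._1).groupBy(_._1).map { case (k, kvs) => k -> kvs.map(_._2).sum }
    hashMap.toMap ++ merged
  }

  def main(args: Array[String]): Unit = {
    val rows = Iterator("a" -> 1L, "b" -> 2L, "c" -> 3L, "a" -> 4L, "d" -> 5L)
    println(sumByKey(rows, maxHashGroups = 2))  // Map(a -> 5, b -> 2, c -> 3, d -> 5)
  }
}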
+ */ +class TungstenAggregationIterator( + groupingExpressions: Seq[NamedExpression], + nonCompleteAggregateExpressions: Seq[AggregateExpression2], + completeAggregateExpressions: Seq[AggregateExpression2], + initialInputBufferOffset: Int, + resultExpressions: Seq[NamedExpression], + newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), + originalInputAttributes: Seq[Attribute], + testFallbackStartsAt: Option[Int], + numInputRows: LongSQLMetric, + numOutputRows: LongSQLMetric) + extends Iterator[UnsafeRow] with Logging { + + // The parent partition iterator, to be initialized later in `start` + private[this] var inputIter: Iterator[InternalRow] = null + + /////////////////////////////////////////////////////////////////////////// + // Part 1: Initializing aggregate functions. + /////////////////////////////////////////////////////////////////////////// + + // A Seq containing all AggregateExpressions. + // It is important that all AggregateExpressions with the mode Partial, PartialMerge or Final + // are at the beginning of the allAggregateExpressions. + private[this] val allAggregateExpressions: Seq[AggregateExpression2] = + nonCompleteAggregateExpressions ++ completeAggregateExpressions + + // Check to make sure we do not have more than three modes in our AggregateExpressions. + // If we have, users are hitting a bug and we throw an IllegalStateException. + if (allAggregateExpressions.map(_.mode).distinct.length > 2) { + throw new IllegalStateException( + s"$allAggregateExpressions should have no more than 2 kinds of modes.") + } + + // + // The modes of AggregateExpressions. Right now, we can handle the following mode: + // - Partial-only: + // All AggregateExpressions have the mode of Partial. + // For this case, aggregationMode is (Some(Partial), None). + // - PartialMerge-only: + // All AggregateExpressions have the mode of PartialMerge). + // For this case, aggregationMode is (Some(PartialMerge), None). + // - Final-only: + // All AggregateExpressions have the mode of Final. + // For this case, aggregationMode is (Some(Final), None). + // - Final-Complete: + // Some AggregateExpressions have the mode of Final and + // others have the mode of Complete. For this case, + // aggregationMode is (Some(Final), Some(Complete)). + // - Complete-only: + // nonCompleteAggregateExpressions is empty and we have AggregateExpressions + // with mode Complete in completeAggregateExpressions. For this case, + // aggregationMode is (None, Some(Complete)). + // - Grouping-only: + // There is no AggregateExpression. For this case, AggregationMode is (None,None). + // + private[this] var aggregationMode: (Option[AggregateMode], Option[AggregateMode]) = { + nonCompleteAggregateExpressions.map(_.mode).distinct.headOption -> + completeAggregateExpressions.map(_.mode).distinct.headOption + } + + // All aggregate functions. TungstenAggregationIterator only handles AlgebraicAggregates. + // If there is any functions that is not an AlgebraicAggregate, we throw an + // IllegalStateException. 
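Editor's note: the comment block above enumerates the legal combinations of aggregate modes as a pair (Option[AggregateMode], Option[AggregateMode]). A compact stand-alone sketch of deriving that pair and rejecting more than two distinct modes, mirroring the check in the diff; the Mode enum here is a local stand-in, not Spark's AggregateMode.

object AggregationModeSketch {
  sealed trait Mode
  case object Partial extends Mode
  case object PartialMerge extends Mode
  case object Final extends Mode
  case object Complete extends Mode

  def modePair(
      nonCompleteModes: Seq[Mode],
      completeModes: Seq[Mode]): (Option[Mode], Option[Mode]) = {
    val all = (nonCompleteModes ++ completeModes).distinct
    require(all.length <= 2, s"$all: no more than 2 distinct modes are supported")
    nonCompleteModes.distinct.headOption -> completeModes.distinct.headOption
  }

  def main(args: Array[String]): Unit = {
    println(modePair(Seq(Partial), Nil))          // (Some(Partial),None)          partial-only
    println(modePair(Seq(Final), Seq(Complete)))  // (Some(Final),Some(Complete))  final-complete
    println(modePair(Nil, Nil))                   // (None,None)                   grouping-only
  }
}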
+ private[this] val allAggregateFunctions: Array[AlgebraicAggregate] = { + if (!allAggregateExpressions.forall(_.aggregateFunction.isInstanceOf[AlgebraicAggregate])) { + throw new IllegalStateException( + "Only AlgebraicAggregates should be passed in TungstenAggregationIterator.") + } + + allAggregateExpressions + .map(_.aggregateFunction.asInstanceOf[AlgebraicAggregate]) + .toArray + } + + /////////////////////////////////////////////////////////////////////////// + // Part 2: Methods and fields used by setting aggregation buffer values, + // processing input rows from inputIter, and generating output + // rows. + /////////////////////////////////////////////////////////////////////////// + + // The projection used to initialize buffer values. + private[this] val algebraicInitialProjection: MutableProjection = { + val initExpressions = allAggregateFunctions.flatMap(_.initialValues) + newMutableProjection(initExpressions, Nil)() + } + + // Creates a new aggregation buffer and initializes buffer values. + // This functions should be only called at most three times (when we create the hash map, + // when we switch to sort-based aggregation, and when we create the re-used buffer for + // sort-based aggregation). + private def createNewAggregationBuffer(): UnsafeRow = { + val bufferSchema = allAggregateFunctions.flatMap(_.bufferAttributes) + val bufferRowSize: Int = bufferSchema.length + + val genericMutableBuffer = new GenericMutableRow(bufferRowSize) + val unsafeProjection = + UnsafeProjection.create(bufferSchema.map(_.dataType)) + val buffer = unsafeProjection.apply(genericMutableBuffer) + algebraicInitialProjection.target(buffer)(EmptyRow) + buffer + } + + // Creates a function used to process a row based on the given inputAttributes. + private def generateProcessRow( + inputAttributes: Seq[Attribute]): (UnsafeRow, InternalRow) => Unit = { + + val aggregationBufferAttributes = allAggregateFunctions.flatMap(_.bufferAttributes) + val joinedRow = new JoinedRow() + + aggregationMode match { + // Partial-only + case (Some(Partial), None) => + val updateExpressions = allAggregateFunctions.flatMap(_.updateExpressions) + val algebraicUpdateProjection = + newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() + + (currentBuffer: UnsafeRow, row: InternalRow) => { + algebraicUpdateProjection.target(currentBuffer) + algebraicUpdateProjection(joinedRow(currentBuffer, row)) + } + + // PartialMerge-only or Final-only + case (Some(PartialMerge), None) | (Some(Final), None) => + val mergeExpressions = allAggregateFunctions.flatMap(_.mergeExpressions) + // This projection is used to merge buffer values for all AlgebraicAggregates. + val algebraicMergeProjection = + newMutableProjection( + mergeExpressions, + aggregationBufferAttributes ++ inputAttributes)() + + (currentBuffer: UnsafeRow, row: InternalRow) => { + // Process all algebraic aggregate functions. 
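Editor's note: generateProcessRow, sketched above for the Partial and PartialMerge/Final cases, comes down to the difference between updating a buffer with a raw input value and merging two partial buffers. The distinction is easiest to see on a concrete aggregate; the sketch below uses a running average with a hand-written (sum, count) buffer, much like what the update and merge expressions of an average aggregate compute, though none of these names are Spark's.

object UpdateVsMergeSketch {
  final case class AvgBuffer(var sum: Double, var count: Long)

  // Partial mode: fold a raw input value into the buffer.
  def update(buffer: AvgBuffer, input: Double): Unit = {
    buffer.sum += input
    buffer.count += 1
  }

  // PartialMerge / Final mode: fold another partial buffer into the buffer.
  def merge(buffer: AvgBuffer, other: AvgBuffer): Unit = {
    buffer.sum += other.sum
    buffer.count += other.count
  }

  def main(args: Array[String]): Unit = {
    val left = AvgBuffer(0.0, 0L)
    Seq(1.0, 2.0).foreach(update(left, _))
    val right = AvgBuffer(0.0, 0L)
    Seq(3.0, 4.0, 5.0).foreach(update(right, _))
    merge(left, right)
    println(left.sum / left.count)  // 3.0
  }
}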
+ algebraicMergeProjection.target(currentBuffer) + algebraicMergeProjection(joinedRow(currentBuffer, row)) + } + + // Final-Complete + case (Some(Final), Some(Complete)) => + val nonCompleteAggregateFunctions: Array[AlgebraicAggregate] = + allAggregateFunctions.take(nonCompleteAggregateExpressions.length) + val completeAggregateFunctions: Array[AlgebraicAggregate] = + allAggregateFunctions.takeRight(completeAggregateExpressions.length) + + val completeOffsetExpressions = + Seq.fill(completeAggregateFunctions.map(_.bufferAttributes.length).sum)(NoOp) + val mergeExpressions = + nonCompleteAggregateFunctions.flatMap(_.mergeExpressions) ++ completeOffsetExpressions + val finalAlgebraicMergeProjection = + newMutableProjection( + mergeExpressions, + aggregationBufferAttributes ++ inputAttributes)() + + // We do not touch buffer values of aggregate functions with the Final mode. + val finalOffsetExpressions = + Seq.fill(nonCompleteAggregateFunctions.map(_.bufferAttributes.length).sum)(NoOp) + val updateExpressions = + finalOffsetExpressions ++ completeAggregateFunctions.flatMap(_.updateExpressions) + val completeAlgebraicUpdateProjection = + newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() + + (currentBuffer: UnsafeRow, row: InternalRow) => { + val input = joinedRow(currentBuffer, row) + // For all aggregate functions with mode Complete, update the given currentBuffer. + completeAlgebraicUpdateProjection.target(currentBuffer)(input) + + // For all aggregate functions with mode Final, merge buffer values in row to + // currentBuffer. + finalAlgebraicMergeProjection.target(currentBuffer)(input) + } + + // Complete-only + case (None, Some(Complete)) => + val completeAggregateFunctions: Array[AlgebraicAggregate] = + allAggregateFunctions.takeRight(completeAggregateExpressions.length) + + val updateExpressions = + completeAggregateFunctions.flatMap(_.updateExpressions) + val completeAlgebraicUpdateProjection = + newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() + + (currentBuffer: UnsafeRow, row: InternalRow) => { + completeAlgebraicUpdateProjection.target(currentBuffer) + // For all aggregate functions with mode Complete, update the given currentBuffer. + completeAlgebraicUpdateProjection(joinedRow(currentBuffer, row)) + } + + // Grouping only. + case (None, None) => (currentBuffer: UnsafeRow, row: InternalRow) => {} + + case other => + throw new IllegalStateException( + s"${aggregationMode} should not be passed into TungstenAggregationIterator.") + } + } + + // Creates a function used to generate output rows. + private def generateResultProjection(): (UnsafeRow, UnsafeRow) => UnsafeRow = { + + val groupingAttributes = groupingExpressions.map(_.toAttribute) + val bufferAttributes = allAggregateFunctions.flatMap(_.bufferAttributes) + + aggregationMode match { + // Partial-only or PartialMerge-only: every output row is basically the values of + // the grouping expressions and the corresponding aggregation buffer. 
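Editor's note: generateResultProjection (continued below) has a matching three-way split: partial modes emit the grouping key plus the raw buffer so a downstream stage can keep merging, Final/Complete modes evaluate the result expressions to a final value, and the grouping-only case just re-emits the key. A tiny sketch of those output shapes with plain tuples standing in for UnsafeRows.

object ResultProjectionSketch {
  type Buffer = (Double, Long)  // (sum, count), standing in for an aggregation buffer

  def partialOutput(key: String, buf: Buffer): (String, Double, Long) = (key, buf._1, buf._2)
  def finalOutput(key: String, buf: Buffer): (String, Double) = (key, buf._1 / buf._2)
  def groupingOnlyOutput(key: String): String = key

  def main(args: Array[String]): Unit = {
    val buf: Buffer = (6.0, 3L)
    println(partialOutput("a", buf))    // (a,6.0,3)  shipped to the final stage
    println(finalOutput("a", buf))      // (a,2.0)    returned to the user
    println(groupingOnlyOutput("a"))    // a          e.g. SELECT key ... GROUP BY key
  }
}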
+ case (Some(Partial), None) | (Some(PartialMerge), None) => + val groupingKeySchema = StructType.fromAttributes(groupingAttributes) + val bufferSchema = StructType.fromAttributes(bufferAttributes) + val unsafeRowJoiner = GenerateUnsafeRowJoiner.create(groupingKeySchema, bufferSchema) + + (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { + unsafeRowJoiner.join(currentGroupingKey, currentBuffer) + } + + // Final-only, Complete-only and Final-Complete: a output row is generated based on + // resultExpressions. + case (Some(Final), None) | (Some(Final) | None, Some(Complete)) => + val joinedRow = new JoinedRow() + val resultProjection = + UnsafeProjection.create(resultExpressions, groupingAttributes ++ bufferAttributes) + + (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { + resultProjection(joinedRow(currentGroupingKey, currentBuffer)) + } + + // Grouping-only: a output row is generated from values of grouping expressions. + case (None, None) => + val resultProjection = + UnsafeProjection.create(resultExpressions, groupingAttributes) + + (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { + resultProjection(currentGroupingKey) + } + + case other => + throw new IllegalStateException( + s"${aggregationMode} should not be passed into TungstenAggregationIterator.") + } + } + + // An UnsafeProjection used to extract grouping keys from the input rows. + private[this] val groupProjection = + UnsafeProjection.create(groupingExpressions, originalInputAttributes) + + // A function used to process a input row. Its first argument is the aggregation buffer + // and the second argument is the input row. + private[this] var processRow: (UnsafeRow, InternalRow) => Unit = + generateProcessRow(originalInputAttributes) + + // A function used to generate output rows based on the grouping keys (first argument) + // and the corresponding aggregation buffer (second argument). + private[this] var generateOutput: (UnsafeRow, UnsafeRow) => UnsafeRow = + generateResultProjection() + + // An aggregation buffer containing initial buffer values. It is used to + // initialize other aggregation buffers. + private[this] val initialAggregationBuffer: UnsafeRow = createNewAggregationBuffer() + + /////////////////////////////////////////////////////////////////////////// + // Part 3: Methods and fields used by hash-based aggregation. + /////////////////////////////////////////////////////////////////////////// + + // This is the hash map used for hash-based aggregation. It is backed by an + // UnsafeFixedWidthAggregationMap and it is used to store + // all groups and their corresponding aggregation buffers for hash-based aggregation. + private[this] val hashMap = new UnsafeFixedWidthAggregationMap( + initialAggregationBuffer, + StructType.fromAttributes(allAggregateFunctions.flatMap(_.bufferAttributes)), + StructType.fromAttributes(groupingExpressions.map(_.toAttribute)), + TaskContext.get.taskMemoryManager(), + SparkEnv.get.shuffleMemoryManager, + 1024 * 16, // initial capacity + SparkEnv.get.shuffleMemoryManager.pageSizeBytes, + false // disable tracking of performance metrics + ) + + // Exposed for testing + private[aggregate] def getHashMap: UnsafeFixedWidthAggregationMap = hashMap + + // The function used to read and process input rows. When processing input rows, + // it first uses hash-based aggregation by putting groups and their buffers in + // hashMap. 
If we could not allocate more memory for the map, we switch to + // sort-based aggregation (by calling switchToSortBasedAggregation). + private def processInputs(): Unit = { + assert(inputIter != null, "attempted to process input when iterator was null") + if (groupingExpressions.isEmpty) { + // If there is no grouping expressions, we can just reuse the same buffer over and over again. + // Note that it would be better to eliminate the hash map entirely in the future. + val groupingKey = groupProjection.apply(null) + val buffer: UnsafeRow = hashMap.getAggregationBufferFromUnsafeRow(groupingKey) + while (inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + processRow(buffer, newInput) + } + } else { + while (!sortBased && inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + val groupingKey = groupProjection.apply(newInput) + val buffer: UnsafeRow = hashMap.getAggregationBufferFromUnsafeRow(groupingKey) + if (buffer == null) { + // buffer == null means that we could not allocate more memory. + // Now, we need to spill the map and switch to sort-based aggregation. + switchToSortBasedAggregation(groupingKey, newInput) + } else { + processRow(buffer, newInput) + } + } + } + } + + // This function is only used for testing. It basically the same as processInputs except + // that it switch to sort-based aggregation after `fallbackStartsAt` input rows have + // been processed. + private def processInputsWithControlledFallback(fallbackStartsAt: Int): Unit = { + assert(inputIter != null, "attempted to process input when iterator was null") + var i = 0 + while (!sortBased && inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + val groupingKey = groupProjection.apply(newInput) + val buffer: UnsafeRow = if (i < fallbackStartsAt) { + hashMap.getAggregationBufferFromUnsafeRow(groupingKey) + } else { + null + } + if (buffer == null) { + // buffer == null means that we could not allocate more memory. + // Now, we need to spill the map and switch to sort-based aggregation. + switchToSortBasedAggregation(groupingKey, newInput) + } else { + processRow(buffer, newInput) + } + i += 1 + } + } + + // The iterator created from hashMap. It is used to generate output rows when we + // are using hash-based aggregation. + private[this] var aggregationBufferMapIterator: KVIterator[UnsafeRow, UnsafeRow] = null + + // Indicates if aggregationBufferMapIterator still has key-value pairs. + private[this] var mapIteratorHasNext: Boolean = false + + /////////////////////////////////////////////////////////////////////////// + // Part 4: Methods and fields used when we switch to sort-based aggregation. + /////////////////////////////////////////////////////////////////////////// + + // This sorter is used for sort-based aggregation. It is initialized as soon as + // we switch from hash-based to sort-based aggregation. Otherwise, it is not used. + private[this] var externalSorter: UnsafeKVExternalSorter = null + + /** + * Switch to sort-based aggregation when the hash-based approach is unable to acquire memory. + */ + private def switchToSortBasedAggregation(firstKey: UnsafeRow, firstInput: InternalRow): Unit = { + assert(inputIter != null, "attempted to process input when iterator was null") + logInfo("falling back to sort based aggregation.") + // Step 1: Get the ExternalSorter containing sorted entries of the map. + externalSorter = hashMap.destructAndCreateExternalSorter() + + // Step 2: Free the memory used by the map. 
+ hashMap.free() + + // Step 3: If we have aggregate function with mode Partial or Complete, + // we need to process input rows to get aggregation buffer. + // So, later in the sort-based aggregation iterator, we can do merge. + // If aggregate functions are with mode Final and PartialMerge, + // we just need to project the aggregation buffer from an input row. + val needsProcess = aggregationMode match { + case (Some(Partial), None) => true + case (None, Some(Complete)) => true + case (Some(Final), Some(Complete)) => true + case _ => false + } + + // Note: Since we spill the sorter's contents immediately after creating it, we must insert + // something into the sorter here to ensure that we acquire at least a page of memory. + // This is done through `externalSorter.insertKV`, which will trigger the page allocation. + // Otherwise, children operators may steal the window of opportunity and starve our sorter. + + if (needsProcess) { + // First, we create a buffer. + val buffer = createNewAggregationBuffer() + + // Process firstKey and firstInput. + // Initialize buffer. + buffer.copyFrom(initialAggregationBuffer) + processRow(buffer, firstInput) + externalSorter.insertKV(firstKey, buffer) + + // Process the rest of input rows. + while (inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + val groupingKey = groupProjection.apply(newInput) + buffer.copyFrom(initialAggregationBuffer) + processRow(buffer, newInput) + externalSorter.insertKV(groupingKey, buffer) + } + } else { + // When needsProcess is false, the format of input rows is groupingKey + aggregation buffer. + // We need to project the aggregation buffer part from an input row. + val buffer = createNewAggregationBuffer() + // The originalInputAttributes are using cloneBufferAttributes. So, we need to use + // allAggregateFunctions.flatMap(_.cloneBufferAttributes). + val bufferExtractor = newMutableProjection( + allAggregateFunctions.flatMap(_.cloneBufferAttributes), + originalInputAttributes)() + bufferExtractor.target(buffer) + + // Insert firstKey and its buffer. + bufferExtractor(firstInput) + externalSorter.insertKV(firstKey, buffer) + + // Insert the rest of input rows. + while (inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + val groupingKey = groupProjection.apply(newInput) + bufferExtractor(newInput) + externalSorter.insertKV(groupingKey, buffer) + } + } + + // Set aggregationMode, processRow, and generateOutput for sort-based aggregation. + val newAggregationMode = aggregationMode match { + case (Some(Partial), None) => (Some(PartialMerge), None) + case (None, Some(Complete)) => (Some(Final), None) + case (Some(Final), Some(Complete)) => (Some(Final), None) + case other => other + } + aggregationMode = newAggregationMode + + // Basically the value of the KVIterator returned by externalSorter + // will just aggregation buffer. At here, we use cloneBufferAttributes. + val newInputAttributes: Seq[Attribute] = + allAggregateFunctions.flatMap(_.cloneBufferAttributes) + + // Set up new processRow and generateOutput. + processRow = generateProcessRow(newInputAttributes) + generateOutput = generateResultProjection() + + // Step 5: Get the sorted iterator from the externalSorter. + sortedKVIterator = externalSorter.sortedIterator() + + // Step 6: Pre-load the first key-value pair from the sorted iterator to make + // hasNext idempotent. + sortedInputHasNewGroup = sortedKVIterator.next() + + // Copy the first key and value (aggregation buffer). 
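Editor's note: when the iterator switches to sort-based aggregation, rows that were going to be updated into the hash map have already been turned into partial buffers, so the mode pair is remapped (Partial becomes PartialMerge, Complete becomes Final) before processRow and generateOutput are regenerated, exactly as in the match above. A stand-alone sketch of that remapping; the Mode values are local stand-ins.

object FallbackModeRemapSketch {
  sealed trait Mode
  case object Partial extends Mode
  case object PartialMerge extends Mode
  case object Final extends Mode
  case object Complete extends Mode

  def remapForSortBased(mode: (Option[Mode], Option[Mode])): (Option[Mode], Option[Mode]) =
    mode match {
      case (Some(Partial), None)         => (Some(PartialMerge), None)
      case (None, Some(Complete))        => (Some(Final), None)
      case (Some(Final), Some(Complete)) => (Some(Final), None)
      case other                         => other
    }

  def main(args: Array[String]): Unit = {
    println(remapForSortBased((Some(Partial), None)))          // (Some(PartialMerge),None)
    println(remapForSortBased((Some(Final), Some(Complete))))  // (Some(Final),None)
    println(remapForSortBased((Some(Final), None)))            // unchanged
  }
}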
+ if (sortedInputHasNewGroup) { + val key = sortedKVIterator.getKey + val value = sortedKVIterator.getValue + nextGroupingKey = key.copy() + currentGroupingKey = key.copy() + firstRowInNextGroup = value.copy() + } + + // Step 7: set sortBased to true. + sortBased = true + } + + /////////////////////////////////////////////////////////////////////////// + // Part 5: Methods and fields used by sort-based aggregation. + /////////////////////////////////////////////////////////////////////////// + + // Indicates if we are using sort-based aggregation. Because we first try to use + // hash-based aggregation, its initial value is false. + private[this] var sortBased: Boolean = false + + // The KVIterator containing input rows for the sort-based aggregation. It will be + // set in switchToSortBasedAggregation when we switch to sort-based aggregation. + private[this] var sortedKVIterator: UnsafeKVExternalSorter#KVSorterIterator = null + + // The grouping key of the current group. + private[this] var currentGroupingKey: UnsafeRow = null + + // The grouping key of next group. + private[this] var nextGroupingKey: UnsafeRow = null + + // The first row of next group. + private[this] var firstRowInNextGroup: UnsafeRow = null + + // Indicates if we has new group of rows from the sorted input iterator. + private[this] var sortedInputHasNewGroup: Boolean = false + + // The aggregation buffer used by the sort-based aggregation. + private[this] val sortBasedAggregationBuffer: UnsafeRow = createNewAggregationBuffer() + + // Processes rows in the current group. It will stop when it find a new group. + private def processCurrentSortedGroup(): Unit = { + // First, we need to copy nextGroupingKey to currentGroupingKey. + currentGroupingKey.copyFrom(nextGroupingKey) + // Now, we will start to find all rows belonging to this group. + // We create a variable to track if we see the next group. + var findNextPartition = false + // firstRowInNextGroup is the first row of this group. We first process it. + processRow(sortBasedAggregationBuffer, firstRowInNextGroup) + + // The search will stop when we see the next group or there is no + // input row left in the iter. + // Pre-load the first key-value pair to make the condition of the while loop + // has no action (we do not trigger loading a new key-value pair + // when we evaluate the condition). + var hasNext = sortedKVIterator.next() + while (!findNextPartition && hasNext) { + // Get the grouping key and value (aggregation buffer). + val groupingKey = sortedKVIterator.getKey + val inputAggregationBuffer = sortedKVIterator.getValue + + // Check if the current row belongs the current input row. + if (currentGroupingKey.equals(groupingKey)) { + processRow(sortBasedAggregationBuffer, inputAggregationBuffer) + + hasNext = sortedKVIterator.next() + } else { + // We find a new group. + findNextPartition = true + // copyFrom will fail when + nextGroupingKey.copyFrom(groupingKey) // = groupingKey.copy() + firstRowInNextGroup.copyFrom(inputAggregationBuffer) // = inputAggregationBuffer.copy() + + } + } + // We have not seen a new group. It means that there is no new row in the input + // iter. The current group is the last group of the sortedKVIterator. + if (!findNextPartition) { + sortedInputHasNewGroup = false + sortedKVIterator.close() + } + } + + /////////////////////////////////////////////////////////////////////////// + // Part 6: Loads input rows and setup aggregationBufferMapIterator if we + // have not switched to sort-based aggregation. 
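Editor's note: processCurrentSortedGroup above walks the sorted key-value iterator, folding rows into the buffer until the key changes, and keeps the first pair of the next group so no input is lost. The same run-detection loop on a plain sorted iterator, with addition standing in for the buffer merge; nothing here is Spark's API.

object SortedGroupSketch {
  // Returns (aggregate for the current run, pre-loaded head of the next run, if any).
  def processRun(
      currentKey: String,
      firstValue: Long,
      rest: Iterator[(String, Long)]): (Long, Option[(String, Long)]) = {
    var acc = firstValue
    while (rest.hasNext) {
      val (k, v) = rest.next()
      if (k == currentKey) acc += v
      else return (acc, Some((k, v)))   // found the next group; remember its first row
    }
    (acc, None)                          // input exhausted: this was the last group
  }

  def main(args: Array[String]): Unit = {
    val sorted = Iterator("a" -> 1L, "a" -> 2L, "b" -> 10L, "b" -> 20L)
    val (k1, v1) = sorted.next()
    val (aSum, nextHead) = processRun(k1, v1, sorted)
    println((k1, aSum))   // (a,3)
    val Some((k2, v2)) = nextHead
    val (bSum, _) = processRun(k2, v2, sorted)
    println((k2, bSum))   // (b,30)
  }
}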
+ /////////////////////////////////////////////////////////////////////////// + + /** + * Start processing input rows. + * Only after this method is called will this iterator be non-empty. + */ + def start(parentIter: Iterator[InternalRow]): Unit = { + inputIter = parentIter + testFallbackStartsAt match { + case None => + processInputs() + case Some(fallbackStartsAt) => + // This is the testing path. processInputsWithControlledFallback is same as processInputs + // except that it switches to sort-based aggregation after `fallbackStartsAt` input rows + // have been processed. + processInputsWithControlledFallback(fallbackStartsAt) + } + + // If we did not switch to sort-based aggregation in processInputs, + // we pre-load the first key-value pair from the map (to make hasNext idempotent). + if (!sortBased) { + // First, set aggregationBufferMapIterator. + aggregationBufferMapIterator = hashMap.iterator() + // Pre-load the first key-value pair from the aggregationBufferMapIterator. + mapIteratorHasNext = aggregationBufferMapIterator.next() + // If the map is empty, we just free it. + if (!mapIteratorHasNext) { + hashMap.free() + } + } + } + + /////////////////////////////////////////////////////////////////////////// + // Part 7: Iterator's public methods. + /////////////////////////////////////////////////////////////////////////// + + override final def hasNext: Boolean = { + (sortBased && sortedInputHasNewGroup) || (!sortBased && mapIteratorHasNext) + } + + override final def next(): UnsafeRow = { + if (hasNext) { + val res = if (sortBased) { + // Process the current group. + processCurrentSortedGroup() + // Generate output row for the current group. + val outputRow = generateOutput(currentGroupingKey, sortBasedAggregationBuffer) + // Initialize buffer values for the next group. + sortBasedAggregationBuffer.copyFrom(initialAggregationBuffer) + + outputRow + } else { + // We did not fall back to sort-based aggregation. + val result = + generateOutput( + aggregationBufferMapIterator.getKey, + aggregationBufferMapIterator.getValue) + + // Pre-load next key-value pair form aggregationBufferMapIterator to make hasNext + // idempotent. + mapIteratorHasNext = aggregationBufferMapIterator.next() + + if (!mapIteratorHasNext) { + // If there is no input from aggregationBufferMapIterator, we copy current result. + val resultCopy = result.copy() + // Then, we free the map. + hashMap.free() + + resultCopy + } else { + result + } + } + + // If this is the last record, update the task's peak memory usage. Since we destroy + // the map to create the sorter, their memory usages should not overlap, so it is safe + // to just use the max of the two. + if (!hasNext) { + val mapMemory = hashMap.getPeakMemoryUsedBytes + val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L) + val peakMemory = Math.max(mapMemory, sorterMemory) + TaskContext.get().internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(peakMemory) + } + numOutputRows += 1 + res + } else { + // no more result + throw new NoSuchElementException + } + } + + /////////////////////////////////////////////////////////////////////////// + // Part 8: Utility functions + /////////////////////////////////////////////////////////////////////////// + + /** + * Generate a output row when there is no input and there is no grouping expression. 
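Editor's note: both the hash-map iterator and the sorted iterator above are pre-loaded one key-value pair ahead so that hasNext only reads cached state and can be called any number of times without consuming input. A minimal sketch of that trick when wrapping a cursor-style iterator (advance() plus get(), in the spirit of Spark's KVIterator); the Cursor trait and helper names are illustrative.

object PreloadedIteratorSketch {
  trait Cursor[A] { def advance(): Boolean; def get(): A }

  def fromSeq[A](xs: Seq[A]): Cursor[A] = new Cursor[A] {
    private var i = -1
    def advance(): Boolean = { i += 1; i < xs.length }
    def get(): A = xs(i)
  }

  // Advance once up front and again after every next(), so hasNext is idempotent.
  def preloaded[A](cursor: Cursor[A]): Iterator[A] = new Iterator[A] {
    private var loaded = cursor.advance()
    def hasNext: Boolean = loaded
    def next(): A = {
      if (!loaded) throw new NoSuchElementException
      val v = cursor.get()
      loaded = cursor.advance()
      v
    }
  }

  def main(args: Array[String]): Unit = {
    val it = preloaded(fromSeq(Seq(1, 2, 3)))
    println(it.hasNext && it.hasNext)  // true: repeated hasNext is safe
    println(it.toList)                 // List(1, 2, 3)
  }
}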
+ */ + def outputForEmptyGroupingKeyWithoutInput(): UnsafeRow = { + assert(groupingExpressions.isEmpty) + assert(inputIter == null) + generateOutput(UnsafeRow.createFromByteArray(0, 0), initialAggregationBuffer) + } + + /** Free memory used in the underlying map. */ + def free(): Unit = { + hashMap.free() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/UnsafeHybridAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/UnsafeHybridAggregationIterator.scala deleted file mode 100644 index 37d34eb7ccf0..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/UnsafeHybridAggregationIterator.scala +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.aggregate - -import org.apache.spark.sql.execution.{UnsafeKeyValueSorter, UnsafeFixedWidthAggregationMap} -import org.apache.spark.unsafe.KVIterator -import org.apache.spark.{SparkEnv, TaskContext} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.types.StructType - -/** - * An iterator used to evaluate [[AggregateFunction2]]. - * It first tries to use in-memory hash-based aggregation. If we cannot allocate more - * space for the hash map, we spill the sorted map entries, free the map, and then - * switch to sort-based aggregation. - */ -class UnsafeHybridAggregationIterator( - groupingKeyAttributes: Seq[Attribute], - valueAttributes: Seq[Attribute], - inputKVIterator: KVIterator[UnsafeRow, InternalRow], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - outputsUnsafeRows: Boolean) - extends AggregationIterator( - groupingKeyAttributes, - valueAttributes, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection, - outputsUnsafeRows) { - - require(groupingKeyAttributes.nonEmpty) - - /////////////////////////////////////////////////////////////////////////// - // Unsafe Aggregation buffers - /////////////////////////////////////////////////////////////////////////// - - // This is the Unsafe Aggregation Map used to store all buffers. 
- private[this] val buffers = new UnsafeFixedWidthAggregationMap( - newBuffer, - StructType.fromAttributes(allAggregateFunctions.flatMap(_.bufferAttributes)), - StructType.fromAttributes(groupingKeyAttributes), - TaskContext.get.taskMemoryManager(), - SparkEnv.get.shuffleMemoryManager, - 1024 * 16, // initial capacity - SparkEnv.get.conf.getSizeAsBytes("spark.buffer.pageSize", "64m"), - false // disable tracking of performance metrics - ) - - override protected def newBuffer: UnsafeRow = { - val bufferSchema = allAggregateFunctions.flatMap(_.bufferAttributes) - val bufferRowSize: Int = bufferSchema.length - - val genericMutableBuffer = new GenericMutableRow(bufferRowSize) - val unsafeProjection = - UnsafeProjection.create(bufferSchema.map(_.dataType)) - val buffer = unsafeProjection.apply(genericMutableBuffer) - initializeBuffer(buffer) - buffer - } - - /////////////////////////////////////////////////////////////////////////// - // Methods and variables related to switching to sort-based aggregation - /////////////////////////////////////////////////////////////////////////// - private[this] var sortBased = false - - private[this] var sortBasedAggregationIterator: SortBasedAggregationIterator = _ - - // The value part of the input KV iterator is used to store original input values of - // aggregate functions, we need to convert them to aggregation buffers. - private def processOriginalInput( - firstKey: UnsafeRow, - firstValue: InternalRow): KVIterator[UnsafeRow, UnsafeRow] = { - new KVIterator[UnsafeRow, UnsafeRow] { - private[this] var isFirstRow = true - - private[this] var groupingKey: UnsafeRow = _ - - private[this] val buffer: UnsafeRow = newBuffer - - override def next(): Boolean = { - initializeBuffer(buffer) - if (isFirstRow) { - isFirstRow = false - groupingKey = firstKey - processRow(buffer, firstValue) - - true - } else if (inputKVIterator.next()) { - groupingKey = inputKVIterator.getKey() - val value = inputKVIterator.getValue() - processRow(buffer, value) - - true - } else { - false - } - } - - override def getKey(): UnsafeRow = { - groupingKey - } - - override def getValue(): UnsafeRow = { - buffer - } - - override def close(): Unit = { - // Do nothing. - } - } - } - - // The value of the input KV Iterator has the format of groupingExprs + aggregation buffer. - // We need to project the aggregation buffer out. - private def projectInputBufferToUnsafe( - firstKey: UnsafeRow, - firstValue: InternalRow): KVIterator[UnsafeRow, UnsafeRow] = { - new KVIterator[UnsafeRow, UnsafeRow] { - private[this] var isFirstRow = true - - private[this] var groupingKey: UnsafeRow = _ - - private[this] val bufferSchema = allAggregateFunctions.flatMap(_.bufferAttributes) - - private[this] val value: UnsafeRow = { - val genericMutableRow = new GenericMutableRow(bufferSchema.length) - UnsafeProjection.create(bufferSchema.map(_.dataType)).apply(genericMutableRow) - } - - private[this] val projectInputBuffer = { - newMutableProjection(bufferSchema, valueAttributes)().target(value) - } - - override def next(): Boolean = { - if (isFirstRow) { - isFirstRow = false - groupingKey = firstKey - projectInputBuffer(firstValue) - - true - } else if (inputKVIterator.next()) { - groupingKey = inputKVIterator.getKey() - projectInputBuffer(inputKVIterator.getValue()) - - true - } else { - false - } - } - - override def getKey(): UnsafeRow = { - groupingKey - } - - override def getValue(): UnsafeRow = { - value - } - - override def close(): Unit = { - // Do nothing. 
- } - } - } - - /** - * We need to fall back to sort based aggregation because we do not have enough memory - * for our in-memory hash map (i.e. `buffers`). - */ - private def switchToSortBasedAggregation( - currentGroupingKey: UnsafeRow, - currentRow: InternalRow): Unit = { - logInfo("falling back to sort based aggregation.") - - // Step 1: Get the ExternalSorter containing entries of the map. - val externalSorter = buffers.destructAndCreateExternalSorter() - - // Step 2: Free the memory used by the map. - buffers.free() - - // Step 3: If we have aggregate function with mode Partial or Complete, - // we need to process them to get aggregation buffer. - // So, later in the sort-based aggregation iterator, we can do merge. - // If aggregate functions are with mode Final and PartialMerge, - // we just need to project the aggregation buffer from the input. - val needsProcess = aggregationMode match { - case (Some(Partial), None) => true - case (None, Some(Complete)) => true - case (Some(Final), Some(Complete)) => true - case _ => false - } - - val processedIterator = if (needsProcess) { - processOriginalInput(currentGroupingKey, currentRow) - } else { - // The input value's format is groupingExprs + buffer. - // We need to project the buffer part out. - projectInputBufferToUnsafe(currentGroupingKey, currentRow) - } - - // Step 4: Redirect processedIterator to externalSorter. - while (processedIterator.next()) { - externalSorter.insertKV(processedIterator.getKey(), processedIterator.getValue()) - } - - // Step 5: Get the sorted iterator from the externalSorter. - val sortedKVIterator: KVIterator[UnsafeRow, UnsafeRow] = externalSorter.sortedIterator() - - // Step 6: We now create a SortBasedAggregationIterator based on sortedKVIterator. - // For a aggregate function with mode Partial, its mode in the SortBasedAggregationIterator - // will be PartialMerge. For a aggregate function with mode Complete, - // its mode in the SortBasedAggregationIterator will be Final. - val newNonCompleteAggregateExpressions = allAggregateExpressions.map { - case AggregateExpression2(func, Partial, isDistinct) => - AggregateExpression2(func, PartialMerge, isDistinct) - case AggregateExpression2(func, Complete, isDistinct) => - AggregateExpression2(func, Final, isDistinct) - case other => other - } - val newNonCompleteAggregateAttributes = - nonCompleteAggregateAttributes ++ completeAggregateAttributes - - val newValueAttributes = - allAggregateExpressions.flatMap(_.aggregateFunction.cloneBufferAttributes) - - sortBasedAggregationIterator = SortBasedAggregationIterator.createFromKVIterator( - groupingKeyAttributes = groupingKeyAttributes, - valueAttributes = newValueAttributes, - inputKVIterator = sortedKVIterator.asInstanceOf[KVIterator[InternalRow, InternalRow]], - nonCompleteAggregateExpressions = newNonCompleteAggregateExpressions, - nonCompleteAggregateAttributes = newNonCompleteAggregateAttributes, - completeAggregateExpressions = Nil, - completeAggregateAttributes = Nil, - initialInputBufferOffset = 0, - resultExpressions = resultExpressions, - newMutableProjection = newMutableProjection, - outputsUnsafeRows = outputsUnsafeRows) - } - - /////////////////////////////////////////////////////////////////////////// - // Methods used to initialize this iterator. - /////////////////////////////////////////////////////////////////////////// - - /** Starts to read input rows and falls back to sort-based aggregation if necessary. 
*/ - protected def initialize(): Unit = { - var hasNext = inputKVIterator.next() - while (!sortBased && hasNext) { - val groupingKey = inputKVIterator.getKey() - val currentRow = inputKVIterator.getValue() - val buffer = buffers.getAggregationBuffer(groupingKey) - if (buffer == null) { - // buffer == null means that we could not allocate more memory. - // Now, we need to spill the map and switch to sort-based aggregation. - switchToSortBasedAggregation(groupingKey, currentRow) - sortBased = true - } else { - processRow(buffer, currentRow) - hasNext = inputKVIterator.next() - } - } - } - - // This is the starting point of this iterator. - initialize() - - // Creates the iterator for the Hash Aggregation Map after we have populated - // contents of that map. - private[this] val aggregationBufferMapIterator = buffers.iterator() - - private[this] var _mapIteratorHasNext = false - - // Pre-load the first key-value pair from the map to make hasNext idempotent. - if (!sortBased) { - _mapIteratorHasNext = aggregationBufferMapIterator.next() - // If the map is empty, we just free it. - if (!_mapIteratorHasNext) { - buffers.free() - } - } - - /////////////////////////////////////////////////////////////////////////// - // Iterator's public methods - /////////////////////////////////////////////////////////////////////////// - - override final def hasNext: Boolean = { - (sortBased && sortBasedAggregationIterator.hasNext) || (!sortBased && _mapIteratorHasNext) - } - - - override final def next(): InternalRow = { - if (hasNext) { - if (sortBased) { - sortBasedAggregationIterator.next() - } else { - // We did not fall back to the sort-based aggregation. - val result = - generateOutput( - aggregationBufferMapIterator.getKey, - aggregationBufferMapIterator.getValue) - // Pre-load next key-value pair form aggregationBufferMapIterator. 
- _mapIteratorHasNext = aggregationBufferMapIterator.next() - - if (!_mapIteratorHasNext) { - val resultCopy = result.copy() - buffers.free() - resultCopy - } else { - result - } - } - } else { - // no more result - throw new NoSuchElementException - } - } -} - -object UnsafeHybridAggregationIterator { - // scalastyle:off - def createFromInputIterator( - groupingExprs: Seq[NamedExpression], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - inputAttributes: Seq[Attribute], - inputIter: Iterator[InternalRow], - outputsUnsafeRows: Boolean): UnsafeHybridAggregationIterator = { - new UnsafeHybridAggregationIterator( - groupingExprs.map(_.toAttribute), - inputAttributes, - AggregationIterator.unsafeKVIterator(groupingExprs, inputAttributes, inputIter), - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection, - outputsUnsafeRows) - } - - def createFromKVIterator( - groupingKeyAttributes: Seq[Attribute], - valueAttributes: Seq[Attribute], - inputKVIterator: KVIterator[UnsafeRow, InternalRow], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - outputsUnsafeRows: Boolean): UnsafeHybridAggregationIterator = { - new UnsafeHybridAggregationIterator( - groupingKeyAttributes, - valueAttributes, - inputKVIterator, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection, - outputsUnsafeRows) - } - // scalastyle:on -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 5fafc916bfa0..d43d3dd9ffaa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -304,7 +304,7 @@ private[sql] case class ScalaUDAF( override def nullable: Boolean = true - override def dataType: DataType = udaf.returnDataType + override def dataType: DataType = udaf.dataType override def deterministic: Boolean = udaf.deterministic @@ -316,7 +316,7 @@ private[sql] case class ScalaUDAF( override lazy val cloneBufferAttributes = bufferAttributes.map(_.newInstance()) - private[this] val childrenSchema: StructType = { + private[this] lazy val childrenSchema: StructType = { val inputFields = children.zipWithIndex.map { case (child, index) => StructField(s"input$index", child.dataType, child.nullable, Metadata.empty) @@ -337,16 +337,16 @@ private[sql] case class ScalaUDAF( } } - private[this] val inputToScalaConverters: Any => Any = + private[this] lazy val inputToScalaConverters: Any => Any = CatalystTypeConverters.createToScalaConverter(childrenSchema) - 
private[this] val bufferValuesToCatalystConverters: Array[Any => Any] = { + private[this] lazy val bufferValuesToCatalystConverters: Array[Any => Any] = { bufferSchema.fields.map { field => CatalystTypeConverters.createToCatalystConverter(field.dataType) } } - private[this] val bufferValuesToScalaConverters: Array[Any => Any] = { + private[this] lazy val bufferValuesToScalaConverters: Array[Any => Any] = { bufferSchema.fields.map { field => CatalystTypeConverters.createToScalaConverter(field.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala index 960be08f84d9..80816a095ea8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala @@ -17,20 +17,41 @@ package org.apache.spark.sql.execution.aggregate +import scala.collection.mutable + import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan} +import org.apache.spark.sql.types.StructType /** * Utility functions used by the query planner to convert our plan to new aggregation code path. */ object Utils { + def supportsTungstenAggregate( + groupingExpressions: Seq[Expression], + aggregateBufferAttributes: Seq[Attribute]): Boolean = { + val aggregationBufferSchema = StructType.fromAttributes(aggregateBufferAttributes) + + UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) && + UnsafeProjection.canSupport(groupingExpressions) + } + def planAggregateWithoutDistinct( groupingExpressions: Seq[Expression], aggregateExpressions: Seq[AggregateExpression2], - aggregateFunctionMap: Map[(AggregateFunction2, Boolean), Attribute], + aggregateFunctionMap: Map[(AggregateFunction2, Boolean), (AggregateFunction2, Attribute)], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { + // Check if we can use TungstenAggregate. + val usesTungstenAggregate = + child.sqlContext.conf.unsafeEnabled && + aggregateExpressions.forall(_.aggregateFunction.isInstanceOf[AlgebraicAggregate]) && + supportsTungstenAggregate( + groupingExpressions, + aggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes)) + + // 1. Create an Aggregate Operator for partial aggregations. 
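// [Editor's illustrative sketch; not part of the original patch.] The planner change above picks
// TungstenAggregate only when unsafe execution is enabled, every aggregate function is an
// AlgebraicAggregate, and the buffer schema / grouping expressions are supported by
// UnsafeFixedWidthAggregationMap and UnsafeProjection. A minimal, hypothetical driver program of
// the kind that exercises this partial/final planning path (names such as "agg-plan-sketch" are
// arbitrary, and whether the Tungsten operator actually appears depends on configuration):
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object AggregatePlanSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("agg-plan-sketch").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // A small grouped aggregation; avg is an AlgebraicAggregate, so this query is eligible for
    // the Tungsten-based aggregation path when the unsafe option is on.
    val df = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3))).toDF("key", "value")
    df.groupBy("key").avg("value").explain()

    sc.stop()
  }
}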
val namedGroupingExpressions = groupingExpressions.map { case ne: NamedExpression => ne -> ne @@ -44,11 +65,23 @@ object Utils { val groupExpressionMap = namedGroupingExpressions.toMap val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) - val partialAggregateAttributes = partialAggregateExpressions.flatMap { agg => - agg.aggregateFunction.bufferAttributes - } - val partialAggregate = - Aggregate( + val partialAggregateAttributes = + partialAggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes) + val partialResultExpressions = + namedGroupingAttributes ++ + partialAggregateExpressions.flatMap(_.aggregateFunction.cloneBufferAttributes) + + val partialAggregate = if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = None: Option[Seq[Expression]], + groupingExpressions = namedGroupingExpressions.map(_._2), + nonCompleteAggregateExpressions = partialAggregateExpressions, + completeAggregateExpressions = Nil, + initialInputBufferOffset = 0, + resultExpressions = partialResultExpressions, + child = child) + } else { + SortBasedAggregate( requiredChildDistributionExpressions = None: Option[Seq[Expression]], groupingExpressions = namedGroupingExpressions.map(_._2), nonCompleteAggregateExpressions = partialAggregateExpressions, @@ -56,29 +89,57 @@ object Utils { completeAggregateExpressions = Nil, completeAggregateAttributes = Nil, initialInputBufferOffset = 0, - resultExpressions = namedGroupingAttributes ++ partialAggregateAttributes, + resultExpressions = partialResultExpressions, child = child) + } // 2. Create an Aggregate Operator for final aggregations. val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) val finalAggregateAttributes = finalAggregateExpressions.map { - expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct) + expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)._2 } - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transformDown { - case agg: AggregateExpression2 => - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct).toAttribute - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. - groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } - val finalAggregate = - Aggregate( + + val finalAggregate = if (usesTungstenAggregate) { + val rewrittenResultExpressions = resultExpressions.map { expr => + expr.transformDown { + case agg: AggregateExpression2 => + // aggregateFunctionMap contains unique aggregate functions. + val aggregateFunction = + aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._1 + aggregateFunction.asInstanceOf[AlgebraicAggregate].evaluateExpression + case expression => + // We do not rely on the equality check at here since attributes may + // different cosmetically. Instead, we use semanticEquals. 
+ groupExpressionMap.collectFirst { + case (expr, ne) if expr semanticEquals expression => ne.toAttribute + }.getOrElse(expression) + }.asInstanceOf[NamedExpression] + } + + TungstenAggregate( + requiredChildDistributionExpressions = Some(namedGroupingAttributes), + groupingExpressions = namedGroupingAttributes, + nonCompleteAggregateExpressions = finalAggregateExpressions, + completeAggregateExpressions = Nil, + initialInputBufferOffset = namedGroupingAttributes.length, + resultExpressions = rewrittenResultExpressions, + child = partialAggregate) + } else { + val rewrittenResultExpressions = resultExpressions.map { expr => + expr.transformDown { + case agg: AggregateExpression2 => + aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._2 + case expression => + // We do not rely on the equality check at here since attributes may + // different cosmetically. Instead, we use semanticEquals. + groupExpressionMap.collectFirst { + case (expr, ne) if expr semanticEquals expression => ne.toAttribute + }.getOrElse(expression) + }.asInstanceOf[NamedExpression] + } + + SortBasedAggregate( requiredChildDistributionExpressions = Some(namedGroupingAttributes), groupingExpressions = namedGroupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, @@ -88,6 +149,7 @@ object Utils { initialInputBufferOffset = namedGroupingAttributes.length, resultExpressions = rewrittenResultExpressions, child = partialAggregate) + } finalAggregate :: Nil } @@ -96,10 +158,18 @@ object Utils { groupingExpressions: Seq[Expression], functionsWithDistinct: Seq[AggregateExpression2], functionsWithoutDistinct: Seq[AggregateExpression2], - aggregateFunctionMap: Map[(AggregateFunction2, Boolean), Attribute], + aggregateFunctionMap: Map[(AggregateFunction2, Boolean), (AggregateFunction2, Attribute)], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { + val aggregateExpressions = functionsWithDistinct ++ functionsWithoutDistinct + val usesTungstenAggregate = + child.sqlContext.conf.unsafeEnabled && + aggregateExpressions.forall(_.aggregateFunction.isInstanceOf[AlgebraicAggregate]) && + supportsTungstenAggregate( + groupingExpressions, + aggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes)) + // 1. Create an Aggregate Operator for partial aggregations. // The grouping expressions are original groupingExpressions and // distinct columns. For example, for avg(distinct value) ... 
group by key @@ -129,19 +199,26 @@ object Utils { val distinctColumnExpressionMap = namedDistinctColumnExpressions.toMap val distinctColumnAttributes = namedDistinctColumnExpressions.map(_._2.toAttribute) - val partialAggregateExpressions = functionsWithoutDistinct.map { - case AggregateExpression2(aggregateFunction, mode, _) => - AggregateExpression2(aggregateFunction, Partial, false) - } - val partialAggregateAttributes = partialAggregateExpressions.flatMap { agg => - agg.aggregateFunction.bufferAttributes - } + val partialAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Partial)) + val partialAggregateAttributes = + partialAggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes) val partialAggregateGroupingExpressions = (namedGroupingExpressions ++ namedDistinctColumnExpressions).map(_._2) val partialAggregateResult = - namedGroupingAttributes ++ distinctColumnAttributes ++ partialAggregateAttributes - val partialAggregate = - Aggregate( + namedGroupingAttributes ++ + distinctColumnAttributes ++ + partialAggregateExpressions.flatMap(_.aggregateFunction.cloneBufferAttributes) + val partialAggregate = if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = None: Option[Seq[Expression]], + groupingExpressions = partialAggregateGroupingExpressions, + nonCompleteAggregateExpressions = partialAggregateExpressions, + completeAggregateExpressions = Nil, + initialInputBufferOffset = 0, + resultExpressions = partialAggregateResult, + child = child) + } else { + SortBasedAggregate( requiredChildDistributionExpressions = None: Option[Seq[Expression]], groupingExpressions = partialAggregateGroupingExpressions, nonCompleteAggregateExpressions = partialAggregateExpressions, @@ -151,20 +228,27 @@ object Utils { initialInputBufferOffset = 0, resultExpressions = partialAggregateResult, child = child) + } // 2. Create an Aggregate Operator for partial merge aggregations. 
- val partialMergeAggregateExpressions = functionsWithoutDistinct.map { - case AggregateExpression2(aggregateFunction, mode, _) => - AggregateExpression2(aggregateFunction, PartialMerge, false) - } + val partialMergeAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = PartialMerge)) val partialMergeAggregateAttributes = - partialMergeAggregateExpressions.flatMap { agg => - agg.aggregateFunction.bufferAttributes - } + partialMergeAggregateExpressions.flatMap(_.aggregateFunction.bufferAttributes) val partialMergeAggregateResult = - namedGroupingAttributes ++ distinctColumnAttributes ++ partialMergeAggregateAttributes - val partialMergeAggregate = - Aggregate( + namedGroupingAttributes ++ + distinctColumnAttributes ++ + partialMergeAggregateExpressions.flatMap(_.aggregateFunction.cloneBufferAttributes) + val partialMergeAggregate = if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = Some(namedGroupingAttributes), + groupingExpressions = namedGroupingAttributes ++ distinctColumnAttributes, + nonCompleteAggregateExpressions = partialMergeAggregateExpressions, + completeAggregateExpressions = Nil, + initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, + resultExpressions = partialMergeAggregateResult, + child = partialAggregate) + } else { + SortBasedAggregate( requiredChildDistributionExpressions = Some(namedGroupingAttributes), groupingExpressions = namedGroupingAttributes ++ distinctColumnAttributes, nonCompleteAggregateExpressions = partialMergeAggregateExpressions, @@ -174,48 +258,91 @@ object Utils { initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, resultExpressions = partialMergeAggregateResult, child = partialAggregate) + } // 3. Create an Aggregate Operator for partial merge aggregations. - val finalAggregateExpressions = functionsWithoutDistinct.map { - case AggregateExpression2(aggregateFunction, mode, _) => - AggregateExpression2(aggregateFunction, Final, false) - } + val finalAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Final)) val finalAggregateAttributes = finalAggregateExpressions.map { - expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct) + expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)._2 } + // Create a map to store those rewritten aggregate functions. We always need to use + // both the function and its corresponding isDistinct flag as the key because the function itself + // does not know whether it has the distinct keyword or not. + val rewrittenAggregateFunctions = + mutable.Map.empty[(AggregateFunction2, Boolean), AggregateFunction2] val (completeAggregateExpressions, completeAggregateAttributes) = functionsWithDistinct.map { // Children of an AggregateFunction with DISTINCT keyword has already // been evaluated. At here, we need to replace original children // to AttributeReferences. - case agg @ AggregateExpression2(aggregateFunction, mode, isDistinct) => + case agg @ AggregateExpression2(aggregateFunction, mode, true) => val rewrittenAggregateFunction = aggregateFunction.transformDown { case expr if distinctColumnExpressionMap.contains(expr) => distinctColumnExpressionMap(expr).toAttribute }.asInstanceOf[AggregateFunction2] + // Because we have rewritten the aggregate function, we use rewrittenAggregateFunctions + // to track the old version and the new version of this function.
+ rewrittenAggregateFunctions += (aggregateFunction, true) -> rewrittenAggregateFunction // We rewrite the aggregate function to a non-distinct aggregation because // its input will have distinct arguments. + // We just keep the isDistinct setting to true, so when users look at the query plan, + // they still can see distinct aggregations. val rewrittenAggregateExpression = - AggregateExpression2(rewrittenAggregateFunction, Complete, false) + AggregateExpression2(rewrittenAggregateFunction, Complete, true) - val aggregateFunctionAttribute = aggregateFunctionMap(agg.aggregateFunction, isDistinct) + val aggregateFunctionAttribute = + aggregateFunctionMap(agg.aggregateFunction, true)._2 (rewrittenAggregateExpression -> aggregateFunctionAttribute) }.unzip - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transform { - case agg: AggregateExpression2 => - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct).toAttribute - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. - groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } - val finalAndCompleteAggregate = - Aggregate( + val finalAndCompleteAggregate = if (usesTungstenAggregate) { + val rewrittenResultExpressions = resultExpressions.map { expr => + expr.transform { + case agg: AggregateExpression2 => + val function = agg.aggregateFunction + val isDistinct = agg.isDistinct + val aggregateFunction = + if (rewrittenAggregateFunctions.contains(function, isDistinct)) { + // If this function has been rewritten, we get the rewritten version from + // rewrittenAggregateFunctions. + rewrittenAggregateFunctions(function, isDistinct) + } else { + // Oterwise, we get it from aggregateFunctionMap, which contains unique + // aggregate functions that have not been rewritten. + aggregateFunctionMap(function, isDistinct)._1 + } + aggregateFunction.asInstanceOf[AlgebraicAggregate].evaluateExpression + case expression => + // We do not rely on the equality check at here since attributes may + // different cosmetically. Instead, we use semanticEquals. + groupExpressionMap.collectFirst { + case (expr, ne) if expr semanticEquals expression => ne.toAttribute + }.getOrElse(expression) + }.asInstanceOf[NamedExpression] + } + + TungstenAggregate( + requiredChildDistributionExpressions = Some(namedGroupingAttributes), + groupingExpressions = namedGroupingAttributes, + nonCompleteAggregateExpressions = finalAggregateExpressions, + completeAggregateExpressions = completeAggregateExpressions, + initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, + resultExpressions = rewrittenResultExpressions, + child = partialMergeAggregate) + } else { + val rewrittenResultExpressions = resultExpressions.map { expr => + expr.transform { + case agg: AggregateExpression2 => + aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._2 + case expression => + // We do not rely on the equality check at here since attributes may + // different cosmetically. Instead, we use semanticEquals. 
+ groupExpressionMap.collectFirst { + case (expr, ne) if expr semanticEquals expression => ne.toAttribute + }.getOrElse(expression) + }.asInstanceOf[NamedExpression] + } + SortBasedAggregate( requiredChildDistributionExpressions = Some(namedGroupingAttributes), groupingExpressions = namedGroupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, @@ -225,6 +352,7 @@ object Utils { initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, resultExpressions = rewrittenResultExpressions, child = partialMergeAggregate) + } finalAndCompleteAggregate :: Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 5a1b000e8987..3f68b05a24f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.rdd.{RDD, ShuffledRDD} +import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow @@ -26,9 +26,11 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.ExternalSorter import org.apache.spark.util.collection.unsafe.sort.PrefixComparator +import org.apache.spark.util.random.PoissonSampler import org.apache.spark.util.{CompletionIterator, MutablePair} import org.apache.spark.{HashPartitioner, SparkEnv} @@ -39,11 +41,20 @@ import org.apache.spark.{HashPartitioner, SparkEnv} case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = projectList.map(_.toAttribute) + override private[sql] lazy val metrics = Map( + "numRows" -> SQLMetrics.createLongMetric(sparkContext, "number of rows")) + @transient lazy val buildProjection = newMutableProjection(projectList, child.output) - protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => - val reusableProjection = buildProjection() - iter.map(reusableProjection) + protected override def doExecute(): RDD[InternalRow] = { + val numRows = longMetric("numRows") + child.execute().mapPartitions { iter => + val reusableProjection = buildProjection() + iter.map { row => + numRows += 1 + reusableProjection(row) + } + } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering @@ -55,19 +66,30 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends */ case class TungstenProject(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { + override private[sql] lazy val metrics = Map( + "numRows" -> SQLMetrics.createLongMetric(sparkContext, "number of rows")) + override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = true override def output: Seq[Attribute] = projectList.map(_.toAttribute) - protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => - 
this.transformAllExpressions { - case CreateStruct(children) => CreateStructUnsafe(children) - case CreateNamedStruct(children) => CreateNamedStructUnsafe(children) + /** Rewrite the project list to use unsafe expressions as needed. */ + protected val unsafeProjectList = projectList.map(_ transform { + case CreateStruct(children) => CreateStructUnsafe(children) + case CreateNamedStruct(children) => CreateNamedStructUnsafe(children) + }) + + protected override def doExecute(): RDD[InternalRow] = { + val numRows = longMetric("numRows") + child.execute().mapPartitions { iter => + val project = UnsafeProjection.create(unsafeProjectList, child.output) + iter.map { row => + numRows += 1 + project(row) + } } - val project = UnsafeProjection.create(projectList, child.output) - iter.map(project) } override def outputOrdering: Seq[SortOrder] = child.outputOrdering @@ -81,8 +103,22 @@ case class TungstenProject(projectList: Seq[NamedExpression], child: SparkPlan) case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output - protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => - iter.filter(newPredicate(condition, child.output)) + private[sql] override lazy val metrics = Map( + "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + + protected override def doExecute(): RDD[InternalRow] = { + val numInputRows = longMetric("numInputRows") + val numOutputRows = longMetric("numOutputRows") + child.execute().mapPartitions { iter => + val predicate = newPredicate(condition, child.output) + iter.filter { row => + numInputRows += 1 + val r = predicate(row) + if (r) numOutputRows += 1 + r + } + } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering @@ -115,12 +151,21 @@ case class Sample( { override def output: Seq[Attribute] = child.output - // TODO: How to pick seed? + override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows + override def canProcessUnsafeRows: Boolean = true + override def canProcessSafeRows: Boolean = true + protected override def doExecute(): RDD[InternalRow] = { if (withReplacement) { - child.execute().map(_.copy()).sample(withReplacement, upperBound - lowerBound, seed) + // Disable gap sampling since the gap sampling method buffers two rows internally, + // requiring us to copy the row, which is more expensive than the random number generator. 
+ new PartitionwiseSampledRDD[InternalRow, InternalRow]( + child.execute(), + new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false), + preservesPartitioning = true, + seed) } else { - child.execute().map(_.copy()).randomSampleWithRange(lowerBound, upperBound, seed) + child.execute().randomSampleWithRange(lowerBound, upperBound, seed) } } } @@ -194,11 +239,16 @@ case class TakeOrderedAndProject( projectList: Option[Seq[NamedExpression]], child: SparkPlan) extends UnaryNode { - override def output: Seq[Attribute] = child.output + override def output: Seq[Attribute] = { + val projectOutput = projectList.map(_.map(_.toAttribute)) + projectOutput.getOrElse(child.output) + } override def outputPartitioning: Partitioning = SinglePartition - private val ord: RowOrdering = new RowOrdering(sortOrder, child.output) + // We need to use an interpreted ordering here because generated orderings cannot be serialized + // and this ordering needs to be created on the driver in order to be passed into Spark core code. + private val ord: InterpretedOrdering = new InterpretedOrdering(sortOrder, child.output) // TODO: remove @transient after figure out how to clean closure at InsertIntoHiveTable. @transient private val projection = projectList.map(new InterpretedProjection(_, child.output)) @@ -218,6 +268,13 @@ case class TakeOrderedAndProject( protected override def doExecute(): RDD[InternalRow] = sparkContext.makeRDD(collectData(), 1) override def outputOrdering: Seq[SortOrder] = sortOrder + + override def simpleString: String = { + val orderByString = sortOrder.mkString("[", ",", "]") + val outputString = output.mkString("[", ",", "]") + + s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" + } } /** @@ -229,6 +286,11 @@ case class Repartition(numPartitions: Int, shuffle: Boolean, child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = { + if (numPartitions == 1) SinglePartition + else UnknownPartitioning(numPartitions) + } + protected override def doExecute(): RDD[InternalRow] = { child.execute().map(_.copy()).coalesce(numPartitions, shuffle) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 6b83025d5a15..95209e663451 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -69,6 +69,8 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan val converted = sideEffectResult.map(convert(_).asInstanceOf[InternalRow]) sqlContext.sparkContext.parallelize(converted, 1) } + + override def argString: String = cmd.toString } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala new file mode 100644 index 000000000000..f7a88b98c0b4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala @@ -0,0 +1,185 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.datasources + +import scala.language.implicitConversions +import scala.util.matching.Regex + +import org.apache.spark.Logging +import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.catalyst.{TableIdentifier, AbstractSparkSQLParser} +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.types._ + + +/** + * A parser for foreign DDL commands. + */ +class DDLParser(parseQuery: String => LogicalPlan) + extends AbstractSparkSQLParser with DataTypeParser with Logging { + + def parse(input: String, exceptionOnError: Boolean): LogicalPlan = { + try { + parse(input) + } catch { + case ddlException: DDLException => throw ddlException + case _ if !exceptionOnError => parseQuery(input) + case x: Throwable => throw x + } + } + + // Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword` + // properties via reflection the class in runtime for constructing the SqlLexical object + protected val CREATE = Keyword("CREATE") + protected val TEMPORARY = Keyword("TEMPORARY") + protected val TABLE = Keyword("TABLE") + protected val IF = Keyword("IF") + protected val NOT = Keyword("NOT") + protected val EXISTS = Keyword("EXISTS") + protected val USING = Keyword("USING") + protected val OPTIONS = Keyword("OPTIONS") + protected val DESCRIBE = Keyword("DESCRIBE") + protected val EXTENDED = Keyword("EXTENDED") + protected val AS = Keyword("AS") + protected val COMMENT = Keyword("COMMENT") + protected val REFRESH = Keyword("REFRESH") + + protected lazy val ddl: Parser[LogicalPlan] = createTable | describeTable | refreshTable + + protected def start: Parser[LogicalPlan] = ddl + + /** + * `CREATE [TEMPORARY] TABLE avroTable [IF NOT EXISTS] + * USING org.apache.spark.sql.avro + * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` + * or + * `CREATE [TEMPORARY] TABLE avroTable(intField int, stringField string...) [IF NOT EXISTS] + * USING org.apache.spark.sql.avro + * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` + * or + * `CREATE [TEMPORARY] TABLE avroTable [IF NOT EXISTS] + * USING org.apache.spark.sql.avro + * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` + * AS SELECT ... + */ + protected lazy val createTable: Parser[LogicalPlan] = { + // TODO: Support database.table. + (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ tableIdentifier ~ + tableCols.? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? 
^^ { + case temp ~ allowExisting ~ tableIdent ~ columns ~ provider ~ opts ~ query => + if (temp.isDefined && allowExisting.isDefined) { + throw new DDLException( + "a CREATE TEMPORARY TABLE statement does not allow IF NOT EXISTS clause.") + } + + val options = opts.getOrElse(Map.empty[String, String]) + if (query.isDefined) { + if (columns.isDefined) { + throw new DDLException( + "a CREATE TABLE AS SELECT statement does not allow column definitions.") + } + // When IF NOT EXISTS clause appears in the query, the save mode will be ignore. + val mode = if (allowExisting.isDefined) { + SaveMode.Ignore + } else if (temp.isDefined) { + SaveMode.Overwrite + } else { + SaveMode.ErrorIfExists + } + + val queryPlan = parseQuery(query.get) + CreateTableUsingAsSelect(tableIdent, + provider, + temp.isDefined, + Array.empty[String], + mode, + options, + queryPlan) + } else { + val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) + CreateTableUsing( + tableIdent, + userSpecifiedSchema, + provider, + temp.isDefined, + options, + allowExisting.isDefined, + managedIfNoPath = false) + } + } + } + + // This is the same as tableIdentifier in SqlParser. + protected lazy val tableIdentifier: Parser[TableIdentifier] = + (ident <~ ".").? ~ ident ^^ { + case maybeDbName ~ tableName => TableIdentifier(tableName, maybeDbName) + } + + protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")" + + /* + * describe [extended] table avroTable + * This will display all columns of table `avroTable` includes column_name,column_type,comment + */ + protected lazy val describeTable: Parser[LogicalPlan] = + (DESCRIBE ~> opt(EXTENDED)) ~ tableIdentifier ^^ { + case e ~ tableIdent => + DescribeCommand(UnresolvedRelation(tableIdent.toSeq, None), e.isDefined) + } + + protected lazy val refreshTable: Parser[LogicalPlan] = + REFRESH ~> TABLE ~> tableIdentifier ^^ { + case tableIndet => + RefreshTable(tableIndet) + } + + protected lazy val options: Parser[Map[String, String]] = + "(" ~> repsep(pair, ",") <~ ")" ^^ { case s: Seq[(String, String)] => s.toMap } + + protected lazy val className: Parser[String] = repsep(ident, ".") ^^ { case s => s.mkString(".")} + + override implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch( + s"identifier matching regex $regex", { + case lexical.Identifier(str) if regex.unapplySeq(str).isDefined => str + case lexical.Keyword(str) if regex.unapplySeq(str).isDefined => str + } + ) + + protected lazy val optionPart: Parser[String] = "[_a-zA-Z][_a-zA-Z0-9]*".r ^^ { + case name => name + } + + protected lazy val optionName: Parser[String] = repsep(optionPart, ".") ^^ { + case parts => parts.mkString(".") + } + + protected lazy val pair: Parser[(String, String)] = + optionName ~ stringLit ^^ { case k ~ v => (k, v) } + + protected lazy val column: Parser[StructField] = + ident ~ dataType ~ (COMMENT ~> stringLit).? 
^^ { case columnName ~ typ ~ cm => + val meta = cm match { + case Some(comment) => + new MetadataBuilder().putString(COMMENT.str.toLowerCase, comment).build() + case None => Metadata.empty + } + + StructField(columnName, typ, nullable = true, meta) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 6b91e51ca52f..c58213155daa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,11 +20,13 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} @@ -99,8 +101,9 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { (a, f) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f, t.paths, confBroadcast))) :: Nil - case l @ LogicalRelation(t: TableScan) => - execution.PhysicalRDD(l.output, toCatalystRDD(l, t.buildScan())) :: Nil + case l @ LogicalRelation(baseRelation: TableScan) => + execution.PhysicalRDD.createFromDataSource( + l.output, toCatalystRDD(l, baseRelation.buildScan()), baseRelation) :: Nil case i @ logical.InsertIntoTable( l @ LogicalRelation(t: InsertableRelation), part, query, overwrite, false) if part.isEmpty => @@ -119,7 +122,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { projections: Seq[NamedExpression], filters: Seq[Expression], partitionColumns: StructType, - partitions: Array[Partition]) = { + partitions: Array[Partition]): SparkPlan = { val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation] // Because we are creating one RDD per partition, we need to have a shared HadoopConf. @@ -128,46 +131,51 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { val confBroadcast = relation.sqlContext.sparkContext.broadcast(new SerializableConfiguration(sharedHadoopConf)) - // Builds RDD[Row]s for each selected partition. - val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => - // The table scan operator (PhysicalRDD) which retrieves required columns from data files. - // Notice that the schema of data files, represented by `relation.dataSchema`, may contain - // some partition column(s). - val scan = - pruneFilterProject( - logicalRelation, - projections, - filters, - (columns: Seq[Attribute], filters) => { - val partitionColNames = partitionColumns.fieldNames - - // Don't scan any partition columns to save I/O. 
Here we are being optimistic and - // assuming partition columns data stored in data files are always consistent with those - // partition values encoded in partition directory paths. - val needed = columns.filterNot(a => partitionColNames.contains(a.name)) - val dataRows = - relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast) - - // Merges data values with partition values. - mergeWithPartitionValues( - relation.schema, - columns.map(_.name).toArray, - partitionColNames, - partitionValues, - toCatalystRDD(logicalRelation, needed, dataRows)) - }) - - scan.execute() - } + // Now, we create a scan builder, which will be used by pruneFilterProject. This scan builder + // will union all partitions and attach partition values if needed. + val scanBuilder = { + (columns: Seq[Attribute], filters: Array[Filter]) => { + // Builds RDD[Row]s for each selected partition. + val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => + val partitionColNames = partitionColumns.fieldNames + + // Don't scan any partition columns to save I/O. Here we are being optimistic and + // assuming partition columns data stored in data files are always consistent with those + // partition values encoded in partition directory paths. + val needed = columns.filterNot(a => partitionColNames.contains(a.name)) + val dataRows = + relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast) + + // Merges data values with partition values. + mergeWithPartitionValues( + relation.schema, + columns.map(_.name).toArray, + partitionColNames, + partitionValues, + toCatalystRDD(logicalRelation, needed, dataRows)) + } + + val unionedRows = + if (perPartitionRows.length == 0) { + relation.sqlContext.emptyResult + } else { + new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) + } - val unionedRows = - if (perPartitionRows.length == 0) { - relation.sqlContext.emptyResult - } else { - new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) + unionedRows } + } + + // Create the scan operator. If needed, add Filter and/or Project on top of the scan. + // The added Filter/Project is on top of the unioned RDD. We do not want to create + // one Filter/Project for every partition. + val sparkPlan = pruneFilterProject( + logicalRelation, + projections, + filters, + scanBuilder) - execution.PhysicalRDD(projections.map(_.toAttribute), unionedRows) + sparkPlan } // TODO: refactor this thing. It is very complicated because it does projection internally. @@ -187,15 +195,17 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // To see whether the `index`-th column is a partition column... val i = partitionColumns.indexOf(name) if (i != -1) { + val dt = schema(partitionColumns(i)).dataType // If yes, gets column value from partition values. (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => { - mutableRow(ordinal) = partitionValues.genericGet(i) + mutableRow(ordinal) = partitionValues.get(i, dt) } } else { // Otherwise, inherits the value from scanned data. val i = nonPartitionColumns.indexOf(name) + val dt = schema(nonPartitionColumns(i)).dataType (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => { - mutableRow(ordinal) = dataRow.genericGet(i) + mutableRow(ordinal) = dataRow.get(i, dt) } } } @@ -295,14 +305,18 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { projects.asInstanceOf[Seq[Attribute]] // Safe due to if above. 
.map(relation.attributeMap) // Match original case of attributes. - val scan = execution.PhysicalRDD(projects.map(_.toAttribute), - scanBuilder(requestedColumns, pushedFilters)) + val scan = execution.PhysicalRDD.createFromDataSource( + projects.map(_.toAttribute), + scanBuilder(requestedColumns, pushedFilters), + relation.relation) filterCondition.map(execution.Filter(_, scan)).getOrElse(scan) } else { val requestedColumns = (projectSet ++ filterSet).map(relation.attributeMap).toSeq - val scan = execution.PhysicalRDD(requestedColumns, - scanBuilder(requestedColumns, pushedFilters)) + val scan = execution.PhysicalRDD.createFromDataSource( + requestedColumns, + scanBuilder(requestedColumns, pushedFilters), + relation.relation) execution.Project(projects, filterCondition.map(execution.Filter(_, scan)).getOrElse(scan)) } } @@ -334,33 +348,47 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { */ protected[sql] def selectFilters(filters: Seq[Expression]) = { def translate(predicate: Expression): Option[Filter] = predicate match { - case expressions.EqualTo(a: Attribute, Literal(v, _)) => - Some(sources.EqualTo(a.name, v)) - case expressions.EqualTo(Literal(v, _), a: Attribute) => - Some(sources.EqualTo(a.name, v)) - - case expressions.GreaterThan(a: Attribute, Literal(v, _)) => - Some(sources.GreaterThan(a.name, v)) - case expressions.GreaterThan(Literal(v, _), a: Attribute) => - Some(sources.LessThan(a.name, v)) - - case expressions.LessThan(a: Attribute, Literal(v, _)) => - Some(sources.LessThan(a.name, v)) - case expressions.LessThan(Literal(v, _), a: Attribute) => - Some(sources.GreaterThan(a.name, v)) - - case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) => - Some(sources.GreaterThanOrEqual(a.name, v)) - case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) => - Some(sources.LessThanOrEqual(a.name, v)) - - case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) => - Some(sources.LessThanOrEqual(a.name, v)) - case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) => - Some(sources.GreaterThanOrEqual(a.name, v)) + case expressions.EqualTo(a: Attribute, Literal(v, t)) => + Some(sources.EqualTo(a.name, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), a: Attribute) => + Some(sources.EqualTo(a.name, convertToScala(v, t))) + + case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) => + Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) + case expressions.EqualNullSafe(Literal(v, t), a: Attribute) => + Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) + + case expressions.GreaterThan(a: Attribute, Literal(v, t)) => + Some(sources.GreaterThan(a.name, convertToScala(v, t))) + case expressions.GreaterThan(Literal(v, t), a: Attribute) => + Some(sources.LessThan(a.name, convertToScala(v, t))) + + case expressions.LessThan(a: Attribute, Literal(v, t)) => + Some(sources.LessThan(a.name, convertToScala(v, t))) + case expressions.LessThan(Literal(v, t), a: Attribute) => + Some(sources.GreaterThan(a.name, convertToScala(v, t))) + + case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, t)) => + Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) + case expressions.GreaterThanOrEqual(Literal(v, t), a: Attribute) => + Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) + + case expressions.LessThanOrEqual(a: Attribute, Literal(v, t)) => + Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) + case expressions.LessThanOrEqual(Literal(v, t), a: Attribute) => + 
Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) case expressions.InSet(a: Attribute, set) => - Some(sources.In(a.name, set.toArray)) + val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) + Some(sources.In(a.name, set.toArray.map(toScala))) + + // Because we only convert In to InSet in Optimizer when there are more than certain + // items. So it is possible we still get an In expression here that needs to be pushed + // down. + case expressions.In(a: Attribute, list) if !list.exists(!_.isInstanceOf[Literal]) => + val hSet = list.map(e => e.eval(EmptyRow)) + val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) + Some(sources.In(a.name, hSet.toArray.map(toScala))) case expressions.IsNull(a: Attribute) => Some(sources.IsNull(a.name)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala new file mode 100644 index 000000000000..6e4cc4de7f65 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala @@ -0,0 +1,64 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.datasources + +import java.util.Properties + +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.execution.datasources.jdbc.{JDBCRelation, JDBCPartitioningInfo, DriverRegistry} +import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider} + + +class DefaultSource extends RelationProvider with DataSourceRegister { + + override def shortName(): String = "jdbc" + + /** Returns a new base relation with the given parameters. 
*/ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + val url = parameters.getOrElse("url", sys.error("Option 'url' not specified")) + val driver = parameters.getOrElse("driver", null) + val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified")) + val partitionColumn = parameters.getOrElse("partitionColumn", null) + val lowerBound = parameters.getOrElse("lowerBound", null) + val upperBound = parameters.getOrElse("upperBound", null) + val numPartitions = parameters.getOrElse("numPartitions", null) + + if (driver != null) DriverRegistry.register(driver) + + if (partitionColumn != null + && (lowerBound == null || upperBound == null || numPartitions == null)) { + sys.error("Partitioning incompletely specified") + } + + val partitionInfo = if (partitionColumn == null) { + null + } else { + JDBCPartitioningInfo( + partitionColumn, + lowerBound.toLong, + upperBound.toLong, + numPartitions.toInt) + } + val parts = JDBCRelation.columnPartition(partitionInfo) + val properties = new Properties() // Additional properties that we will pass to getConnection + parameters.foreach(kv => properties.setProperty(kv._1, kv._2)) + JDBCRelation(url, table, parts, properties)(sqlContext) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSource.scala new file mode 100644 index 000000000000..3b7dc2e8d021 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSource.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.sql.sources.InsertableRelation + + +/** + * Inserts the results of `query` in to a relation that extends [[InsertableRelation]]. + */ +private[sql] case class InsertIntoDataSource( + logicalRelation: LogicalRelation, + query: LogicalPlan, + overwrite: Boolean) + extends RunnableCommand { + + override def run(sqlContext: SQLContext): Seq[Row] = { + val relation = logicalRelation.relation.asInstanceOf[InsertableRelation] + val data = DataFrame(sqlContext, query) + // Apply the schema of the existing table to the new data. + val df = sqlContext.internalCreateDataFrame(data.queryExecution.toRdd, logicalRelation.schema) + relation.insert(df, overwrite) + + // Invalidate the cache. 
+ sqlContext.cacheManager.invalidateCache(logicalRelation) + + Seq.empty[Row] + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala new file mode 100644 index 000000000000..735d52f80886 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.IOException + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat +import org.apache.spark._ +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.{RunnableCommand, SQLExecution} +import org.apache.spark.sql.sources._ +import org.apache.spark.util.Utils + + +/** + * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. + * Writing to dynamic partitions is also supported. Each [[InsertIntoHadoopFsRelation]] issues a + * single write job, and owns a UUID that identifies this job. Each concrete implementation of + * [[HadoopFsRelation]] should use this UUID together with task id to generate unique file path for + * each task output file. This UUID is passed to executor side via a property named + * `spark.sql.sources.writeJobUUID`. + * + * Different writer containers, [[DefaultWriterContainer]] and [[DynamicPartitionWriterContainer]] + * are used to write to normal tables and tables with dynamic partitions. + * + * Basic work flow of this command is: + * + * 1. Driver side setup, including output committer initialization and data source specific + * preparation work for the write job to be issued. + * 2. Issues a write job consists of one or more executor side tasks, each of which writes all + * rows within an RDD partition. + * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any + * exception is thrown during task commitment, also aborts that task. + * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is + * thrown during job commitment, also aborts the job. 
+ */ +private[sql] case class InsertIntoHadoopFsRelation( + @transient relation: HadoopFsRelation, + @transient query: LogicalPlan, + mode: SaveMode) + extends RunnableCommand { + + override def run(sqlContext: SQLContext): Seq[Row] = { + require( + relation.paths.length == 1, + s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") + + val hadoopConf = sqlContext.sparkContext.hadoopConfiguration + val outputPath = new Path(relation.paths.head) + val fs = outputPath.getFileSystem(hadoopConf) + val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + + val pathExists = fs.exists(qualifiedOutputPath) + val doInsertion = (mode, pathExists) match { + case (SaveMode.ErrorIfExists, true) => + throw new AnalysisException(s"path $qualifiedOutputPath already exists.") + case (SaveMode.Overwrite, true) => + Utils.tryOrIOException { + if (!fs.delete(qualifiedOutputPath, true /* recursively */)) { + throw new IOException(s"Unable to clear output " + + s"directory $qualifiedOutputPath prior to writing to it") + } + } + true + case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => + true + case (SaveMode.Ignore, exists) => + !exists + case (s, exists) => + throw new IllegalStateException(s"unsupported save mode $s ($exists)") + } + // If we are appending data to an existing dir. + val isAppend = pathExists && (mode == SaveMode.Append) + + if (doInsertion) { + val job = new Job(hadoopConf) + job.setOutputKeyClass(classOf[Void]) + job.setOutputValueClass(classOf[InternalRow]) + FileOutputFormat.setOutputPath(job, qualifiedOutputPath) + + // A partitioned relation schema's can be different from the input logicalPlan, since + // partition columns are all moved after data column. We Project to adjust the ordering. + // TODO: this belongs in the analyzer. + val project = Project( + relation.schema.map(field => UnresolvedAttribute.quoted(field.name)), query) + val queryExecution = DataFrame(sqlContext, project).queryExecution + + SQLExecution.withNewExecutionId(sqlContext, queryExecution) { + val df = sqlContext.internalCreateDataFrame(queryExecution.toRdd, relation.schema) + val partitionColumns = relation.partitionColumns.fieldNames + + // Some pre-flight checks. + require( + df.schema == relation.schema, + s"""DataFrame must have the same schema as the relation to which is inserted. + |DataFrame schema: ${df.schema} + |Relation schema: ${relation.schema} + """.stripMargin) + val partitionColumnsInSpec = relation.partitionColumns.fieldNames + require( + partitionColumnsInSpec.sameElements(partitionColumns), + s"""Partition columns mismatch. + |Expected: ${partitionColumnsInSpec.mkString(", ")} + |Actual: ${partitionColumns.mkString(", ")} + """.stripMargin) + + val writerContainer = if (partitionColumns.isEmpty) { + new DefaultWriterContainer(relation, job, isAppend) + } else { + val output = df.queryExecution.executedPlan.output + val (partitionOutput, dataOutput) = + output.partition(a => partitionColumns.contains(a.name)) + + new DynamicPartitionWriterContainer( + relation, + job, + partitionOutput, + dataOutput, + output, + PartitioningUtils.DEFAULT_PARTITION_NAME, + sqlContext.conf.getConf(SQLConf.PARTITION_MAX_FILES), + isAppend) + } + + // This call shouldn't be put into the `try` block below because it only initializes and + // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. 
+ writerContainer.driverSideSetup() + + try { + sqlContext.sparkContext.runJob(df.queryExecution.toRdd, writerContainer.writeRows _) + writerContainer.commitJob() + relation.refresh() + } catch { case cause: Throwable => + logError("Aborting job.", cause) + writerContainer.abortJob() + throw new SparkException("Job aborted.", cause) + } + } + } else { + logInfo("Skipping insertion into a relation that already exists.") + } + + Seq.empty[Row] + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala new file mode 100644 index 000000000000..7770bbd712f0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala @@ -0,0 +1,204 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.datasources + +import java.util.ServiceLoader + +import scala.collection.JavaConversions._ +import scala.language.{existentials, implicitConversions} +import scala.util.{Success, Failure, Try} + +import org.apache.hadoop.fs.Path + +import org.apache.spark.Logging +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.sql.{DataFrame, SaveMode, AnalysisException, SQLContext} +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.{CalendarIntervalType, StructType} +import org.apache.spark.util.Utils + + +case class ResolvedDataSource(provider: Class[_], relation: BaseRelation) + + +object ResolvedDataSource extends Logging { + + /** A map to maintain backward compatibility in case we move data sources around. */ + private val backwardCompatibilityMap = Map( + "org.apache.spark.sql.jdbc" -> classOf[jdbc.DefaultSource].getCanonicalName, + "org.apache.spark.sql.jdbc.DefaultSource" -> classOf[jdbc.DefaultSource].getCanonicalName, + "org.apache.spark.sql.json" -> classOf[json.DefaultSource].getCanonicalName, + "org.apache.spark.sql.json.DefaultSource" -> classOf[json.DefaultSource].getCanonicalName, + "org.apache.spark.sql.parquet" -> classOf[parquet.DefaultSource].getCanonicalName, + "org.apache.spark.sql.parquet.DefaultSource" -> classOf[parquet.DefaultSource].getCanonicalName + ) + + /** Given a provider name, look up the data source class definition. 
*/ + def lookupDataSource(provider0: String): Class[_] = { + val provider = backwardCompatibilityMap.getOrElse(provider0, provider0) + val provider2 = s"$provider.DefaultSource" + val loader = Utils.getContextOrSparkClassLoader + val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) + + serviceLoader.iterator().filter(_.shortName().equalsIgnoreCase(provider)).toList match { + /** the provider format did not match any given registered aliases */ + case Nil => Try(loader.loadClass(provider)).orElse(Try(loader.loadClass(provider2))) match { + case Success(dataSource) => dataSource + case Failure(error) => + if (provider.startsWith("org.apache.spark.sql.hive.orc")) { + throw new ClassNotFoundException( + "The ORC data source must be used with Hive support enabled.", error) + } else { + throw new ClassNotFoundException( + s"Failed to load class for data source: $provider.", error) + } + } + /** there is exactly one registered alias */ + case head :: Nil => head.getClass + /** There are multiple registered aliases for the input */ + case sources => sys.error(s"Multiple sources found for $provider, " + + s"(${sources.map(_.getClass.getName).mkString(", ")}), " + + "please specify the fully qualified class name.") + } + } + + /** Create a [[ResolvedDataSource]] for reading data in. */ + def apply( + sqlContext: SQLContext, + userSpecifiedSchema: Option[StructType], + partitionColumns: Array[String], + provider: String, + options: Map[String, String]): ResolvedDataSource = { + val clazz: Class[_] = lookupDataSource(provider) + def className: String = clazz.getCanonicalName + val relation = userSpecifiedSchema match { + case Some(schema: StructType) => clazz.newInstance() match { + case dataSource: SchemaRelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) + case dataSource: HadoopFsRelationProvider => + val maybePartitionsSchema = if (partitionColumns.isEmpty) { + None + } else { + Some(partitionColumnsSchema(schema, partitionColumns)) + } + + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + val paths = { + val patternPath = new Path(caseInsensitiveOptions("path")) + val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray + } + + val dataSchema = + StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable + + dataSource.createRelation( + sqlContext, + paths, + Some(dataSchema), + maybePartitionsSchema, + caseInsensitiveOptions) + case dataSource: org.apache.spark.sql.sources.RelationProvider => + throw new AnalysisException(s"$className does not allow user-specified schemas.") + case _ => + throw new AnalysisException(s"$className is not a RelationProvider.") + } + + case None => clazz.newInstance() match { + case dataSource: RelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) + case dataSource: HadoopFsRelationProvider => + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + val paths = { + val patternPath = new Path(caseInsensitiveOptions("path")) + val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray + } + dataSource.createRelation(sqlContext, paths, None, 
None, caseInsensitiveOptions) + case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => + throw new AnalysisException( + s"A schema needs to be specified when using $className.") + case _ => + throw new AnalysisException( + s"$className is neither a RelationProvider nor a FSBasedRelationProvider.") + } + } + new ResolvedDataSource(clazz, relation) + } + + private def partitionColumnsSchema( + schema: StructType, + partitionColumns: Array[String]): StructType = { + StructType(partitionColumns.map { col => + schema.find(_.name == col).getOrElse { + throw new RuntimeException(s"Partition column $col not found in schema $schema") + } + }).asNullable + } + + /** Create a [[ResolvedDataSource]] for saving the content of the given DataFrame. */ + def apply( + sqlContext: SQLContext, + provider: String, + partitionColumns: Array[String], + mode: SaveMode, + options: Map[String, String], + data: DataFrame): ResolvedDataSource = { + if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) { + throw new AnalysisException("Cannot save interval data type into external storage.") + } + val clazz: Class[_] = lookupDataSource(provider) + val relation = clazz.newInstance() match { + case dataSource: CreatableRelationProvider => + dataSource.createRelation(sqlContext, mode, options, data) + case dataSource: HadoopFsRelationProvider => + // Don't glob path for the write path. The contracts here are: + // 1. Only one output path can be specified on the write path; + // 2. Output path must be a legal HDFS style file system path; + // 3. It's OK that the output path doesn't exist yet; + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + val outputPath = { + val path = new Path(caseInsensitiveOptions("path")) + val fs = path.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + path.makeQualified(fs.getUri, fs.getWorkingDirectory) + } + val dataSchema = StructType(data.schema.filterNot(f => partitionColumns.contains(f.name))) + val r = dataSource.createRelation( + sqlContext, + Array(outputPath.toString), + Some(dataSchema.asNullable), + Some(partitionColumnsSchema(data.schema, partitionColumns)), + caseInsensitiveOptions) + + // For partitioned relation r, r.schema's column ordering can be different from the column + // ordering of data.logicalPlan (partition columns are all moved after data column). This + // will be adjusted within InsertIntoHadoopFsRelation. + sqlContext.executePlan( + InsertIntoHadoopFsRelation( + r, + data.logicalPlan, + mode)).toRdd + r + case _ => + sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") + } + ResolvedDataSource(clazz, relation) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala new file mode 100644 index 000000000000..d454144d4246 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -0,0 +1,435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.util.{Date, UUID} + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter} +import org.apache.spark._ +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.mapred.SparkHadoopMapRedUtil +import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.UnsafeKVExternalSorter +import org.apache.spark.sql.sources.{HadoopFsRelation, OutputWriter, OutputWriterFactory} +import org.apache.spark.sql.types.{StructType, StringType} +import org.apache.spark.util.SerializableConfiguration + + +private[sql] abstract class BaseWriterContainer( + @transient val relation: HadoopFsRelation, + @transient job: Job, + isAppend: Boolean) + extends SparkHadoopMapReduceUtil + with Logging + with Serializable { + + protected val dataSchema = relation.dataSchema + + protected val serializableConf = new SerializableConfiguration(job.getConfiguration) + + // This UUID is used to avoid output file name collision between different appending write jobs. + // These jobs may belong to different SparkContext instances. Concrete data source implementations + // may use this UUID to generate unique file names (e.g., `part-r--.parquet`). + // The reason why this ID is used to identify a job rather than a single task output file is + // that, speculative tasks must generate the same output file name as the original task. + private val uniqueWriteJobId = UUID.randomUUID() + + // This is only used on driver side. + @transient private val jobContext: JobContext = job + + private val speculationEnabled: Boolean = + relation.sqlContext.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) + + // The following fields are initialized and used on both driver and executor side. + @transient protected var outputCommitter: OutputCommitter = _ + @transient private var jobId: JobID = _ + @transient private var taskId: TaskID = _ + @transient private var taskAttemptId: TaskAttemptID = _ + @transient protected var taskAttemptContext: TaskAttemptContext = _ + + protected val outputPath: String = { + assert( + relation.paths.length == 1, + s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") + relation.paths.head + } + + protected var outputWriterFactory: OutputWriterFactory = _ + + private var outputFormatClass: Class[_ <: OutputFormat[_, _]] = _ + + def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit + + def driverSideSetup(): Unit = { + setupIDs(0, 0, 0) + setupConf() + + // This UUID is sent to executor side together with the serialized `Configuration` object within + // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate + // unique task output files. 
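The next line publishes this UUID under `spark.sql.sources.writeJobUUID`. A hedged sketch of the consumer side of that contract: an executor-side writer could read the UUID back from its `TaskAttemptContext` and combine it with the task id into a collision-free file name. The helper and the exact naming scheme below are illustrative assumptions, not part of this patch; each data source chooses its own.

    import org.apache.hadoop.mapreduce.TaskAttemptContext

    // Hypothetical helper: speculative attempts of the same task share a task id, so they
    // deterministically produce the same file name, while a different (e.g. concurrently
    // appending) write job gets a different UUID and therefore cannot collide.
    def uniqueOutputFileName(context: TaskAttemptContext, extension: String): String = {
      val jobUuid = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
      val taskId = context.getTaskAttemptID.getTaskID.getId
      f"part-r-$taskId%05d-$jobUuid$extension"
    }

Keying the name on the job rather than on an individual task attempt is what makes retries and speculation overwrite cleanly instead of leaving duplicate files behind.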
+ job.getConfiguration.set("spark.sql.sources.writeJobUUID", uniqueWriteJobId.toString) + + // Order of the following two lines is important. For Hadoop 1, TaskAttemptContext constructor + // clones the Configuration object passed in. If we initialize the TaskAttemptContext first, + // configurations made in prepareJobForWrite(job) are not populated into the TaskAttemptContext. + // + // Also, the `prepareJobForWrite` call must happen before initializing output format and output + // committer, since their initialization involves the job configuration, which can be potentially + // decorated in `prepareJobForWrite`. + outputWriterFactory = relation.prepareJobForWrite(job) + taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId) + + outputFormatClass = job.getOutputFormatClass + outputCommitter = newOutputCommitter(taskAttemptContext) + outputCommitter.setupJob(jobContext) + } + + def executorSideSetup(taskContext: TaskContext): Unit = { + setupIDs(taskContext.stageId(), taskContext.partitionId(), taskContext.attemptNumber()) + setupConf() + taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId) + outputCommitter = newOutputCommitter(taskAttemptContext) + outputCommitter.setupTask(taskAttemptContext) + } + + protected def getWorkPath: String = { + outputCommitter match { + // FileOutputCommitter writes to a temporary location returned by `getWorkPath`. + case f: MapReduceFileOutputCommitter => f.getWorkPath.toString + case _ => outputPath + } + } + + private def newOutputCommitter(context: TaskAttemptContext): OutputCommitter = { + val defaultOutputCommitter = outputFormatClass.newInstance().getOutputCommitter(context) + + if (isAppend) { + // If we are appending data to an existing dir, we will only use the output committer + // associated with the file output format since it is not safe to use a custom + // committer for appending. For example, in S3, a direct parquet output committer may + // leave partial data in the destination dir when the appending job fails. + // + // See SPARK-8578 for more details. + logInfo( + s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + + "for appending.") + defaultOutputCommitter + } else if (speculationEnabled) { + // When speculation is enabled, it's not safe to use customized output committer classes, + // especially direct output committers (e.g. `DirectParquetOutputCommitter`). + // + // See SPARK-9899 for more details. + logInfo( + s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + + "because spark.speculation is configured to be true.") + defaultOutputCommitter + } else { + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val committerClass = configuration.getClass( + SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) + + Option(committerClass).map { clazz => + logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") + + // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat + // has an associated output committer. To override this output committer, + // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. + // If a data source needs to override the output committer, it needs to set the + // output committer in the prepareJobForWrite method. + if (classOf[MapReduceFileOutputCommitter].isAssignableFrom(clazz)) { + // The specified output committer is a FileOutputCommitter.
+ // So, we will use the FileOutputCommitter-specified constructor. + val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) + ctor.newInstance(new Path(outputPath), context) + } else { + // The specified output committer is just a OutputCommitter. + // So, we will use the no-argument constructor. + val ctor = clazz.getDeclaredConstructor() + ctor.newInstance() + } + }.getOrElse { + // If output committer class is not set, we will use the one associated with the + // file output format. + logInfo( + s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName}") + defaultOutputCommitter + } + } + } + + private def setupIDs(jobId: Int, splitId: Int, attemptId: Int): Unit = { + this.jobId = SparkHadoopWriter.createJobID(new Date, jobId) + this.taskId = new TaskID(this.jobId, true, splitId) + this.taskAttemptId = new TaskAttemptID(taskId, attemptId) + } + + private def setupConf(): Unit = { + serializableConf.value.set("mapred.job.id", jobId.toString) + serializableConf.value.set("mapred.tip.id", taskAttemptId.getTaskID.toString) + serializableConf.value.set("mapred.task.id", taskAttemptId.toString) + serializableConf.value.setBoolean("mapred.task.is.map", true) + serializableConf.value.setInt("mapred.task.partition", 0) + } + + def commitTask(): Unit = { + SparkHadoopMapRedUtil.commitTask( + outputCommitter, taskAttemptContext, jobId.getId, taskId.getId, taskAttemptId.getId) + } + + def abortTask(): Unit = { + if (outputCommitter != null) { + outputCommitter.abortTask(taskAttemptContext) + } + logError(s"Task attempt $taskAttemptId aborted.") + } + + def commitJob(): Unit = { + outputCommitter.commitJob(jobContext) + logInfo(s"Job $jobId committed.") + } + + def abortJob(): Unit = { + if (outputCommitter != null) { + outputCommitter.abortJob(jobContext, JobStatus.State.FAILED) + } + logError(s"Job $jobId aborted.") + } +} + +/** + * A writer that writes all of the rows in a partition to a single file. + */ +private[sql] class DefaultWriterContainer( + @transient relation: HadoopFsRelation, + @transient job: Job, + isAppend: Boolean) + extends BaseWriterContainer(relation, job, isAppend) { + + def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { + executorSideSetup(taskContext) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(taskAttemptContext) + configuration.set("spark.sql.sources.output.path", outputPath) + val writer = outputWriterFactory.newInstance(getWorkPath, dataSchema, taskAttemptContext) + writer.initConverter(dataSchema) + + var writerClosed = false + + // If anything below fails, we should abort the task. + try { + while (iterator.hasNext) { + val internalRow = iterator.next() + writer.writeInternal(internalRow) + } + + commitTask() + } catch { + case cause: Throwable => + logError("Aborting task.", cause) + abortTask() + throw new SparkException("Task failed while writing rows.", cause) + } + + def commitTask(): Unit = { + try { + assert(writer != null, "OutputWriter instance should have been initialized") + if (!writerClosed) { + writer.close() + writerClosed = true + } + super.commitTask() + } catch { + case cause: Throwable => + // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and + // will cause `abortTask()` to be invoked. 
+ throw new RuntimeException("Failed to commit task", cause) + } + } + + def abortTask(): Unit = { + try { + if (!writerClosed) { + writer.close() + writerClosed = true + } + } finally { + super.abortTask() + } + } + } +} + +/** + * A writer that dynamically opens files based on the given partition columns. Internally this is + * done by maintaining a HashMap of open files until `maxFiles` is reached. If this occurs, the + * writer externally sorts the remaining rows and then writes out them out one file at a time. + */ +private[sql] class DynamicPartitionWriterContainer( + @transient relation: HadoopFsRelation, + @transient job: Job, + partitionColumns: Seq[Attribute], + dataColumns: Seq[Attribute], + inputSchema: Seq[Attribute], + defaultPartitionName: String, + maxOpenFiles: Int, + isAppend: Boolean) + extends BaseWriterContainer(relation, job, isAppend) { + + def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { + val outputWriters = new java.util.HashMap[InternalRow, OutputWriter] + executorSideSetup(taskContext) + + var outputWritersCleared = false + + // Returns the partition key given an input row + val getPartitionKey = UnsafeProjection.create(partitionColumns, inputSchema) + // Returns the data columns to be written given an input row + val getOutputRow = UnsafeProjection.create(dataColumns, inputSchema) + + // Expressions that given a partition key build a string like: col1=val/col2=val/... + val partitionStringExpression = partitionColumns.zipWithIndex.flatMap { case (c, i) => + val escaped = + ScalaUDF( + PartitioningUtils.escapePathName _, StringType, Seq(Cast(c, StringType)), Seq(StringType)) + val str = If(IsNull(c), Literal(defaultPartitionName), escaped) + val partitionName = Literal(c.name + "=") :: str :: Nil + if (i == 0) partitionName else Literal(Path.SEPARATOR_CHAR.toString) :: partitionName + } + + // Returns the partition path given a partition key. + val getPartitionString = + UnsafeProjection.create(Concat(partitionStringExpression) :: Nil, partitionColumns) + + // If anything below fails, we should abort the task. + try { + // This will be filled in if we have to fall back on sorting. + var sorter: UnsafeKVExternalSorter = null + while (iterator.hasNext && sorter == null) { + val inputRow = iterator.next() + val currentKey = getPartitionKey(inputRow) + var currentWriter = outputWriters.get(currentKey) + + if (currentWriter == null) { + if (outputWriters.size < maxOpenFiles) { + currentWriter = newOutputWriter(currentKey) + outputWriters.put(currentKey.copy(), currentWriter) + currentWriter.writeInternal(getOutputRow(inputRow)) + } else { + logInfo(s"Maximum partitions reached, falling back on sorting.") + sorter = new UnsafeKVExternalSorter( + StructType.fromAttributes(partitionColumns), + StructType.fromAttributes(dataColumns), + SparkEnv.get.blockManager, + SparkEnv.get.shuffleMemoryManager, + SparkEnv.get.shuffleMemoryManager.pageSizeBytes) + sorter.insertKV(currentKey, getOutputRow(inputRow)) + } + } else { + currentWriter.writeInternal(getOutputRow(inputRow)) + } + } + + // If the sorter is not null that means that we reached the maxFiles above and need to finish + // using external sort. + if (sorter != null) { + while (iterator.hasNext) { + val currentRow = iterator.next() + sorter.insertKV(getPartitionKey(currentRow), getOutputRow(currentRow)) + } + + logInfo(s"Sorting complete. 
Writing out partition files one at a time.") + + val sortedIterator = sorter.sortedIterator() + var currentKey: InternalRow = null + var currentWriter: OutputWriter = null + try { + while (sortedIterator.next()) { + if (currentKey != sortedIterator.getKey) { + if (currentWriter != null) { + currentWriter.close() + } + currentKey = sortedIterator.getKey.copy() + logDebug(s"Writing partition: $currentKey") + + // Either use an existing file from before, or open a new one. + currentWriter = outputWriters.remove(currentKey) + if (currentWriter == null) { + currentWriter = newOutputWriter(currentKey) + } + } + + currentWriter.writeInternal(sortedIterator.getValue) + } + } finally { + if (currentWriter != null) { currentWriter.close() } + } + } + + commitTask() + } catch { + case cause: Throwable => + logError("Aborting task.", cause) + abortTask() + throw new SparkException("Task failed while writing rows.", cause) + } + + /** Open and returns a new OutputWriter given a partition key. */ + def newOutputWriter(key: InternalRow): OutputWriter = { + val partitionPath = getPartitionString(key).getString(0) + val path = new Path(getWorkPath, partitionPath) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(taskAttemptContext) + configuration.set( + "spark.sql.sources.output.path", new Path(outputPath, partitionPath).toString) + val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext) + newWriter.initConverter(dataSchema) + newWriter + } + + def clearOutputWriters(): Unit = { + if (!outputWritersCleared) { + outputWriters.asScala.values.foreach(_.close()) + outputWriters.clear() + outputWritersCleared = true + } + } + + def commitTask(): Unit = { + try { + clearOutputWriters() + super.commitTask() + } catch { + case cause: Throwable => + throw new RuntimeException("Failed to commit task", cause) + } + } + + def abortTask(): Unit = { + try { + clearOutputWriters() + } finally { + super.abortTask() + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala deleted file mode 100644 index d551f386eee6..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala +++ /dev/null @@ -1,599 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.datasources - -import java.util.{Date, UUID} - -import scala.collection.JavaConversions.asScalaIterator - -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat} -import org.apache.spark._ -import org.apache.spark.mapred.SparkHadoopMapRedUtil -import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.execution.RunnableCommand -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.StringType -import org.apache.spark.util.SerializableConfiguration - - -private[sql] case class InsertIntoDataSource( - logicalRelation: LogicalRelation, - query: LogicalPlan, - overwrite: Boolean) - extends RunnableCommand { - - override def run(sqlContext: SQLContext): Seq[Row] = { - val relation = logicalRelation.relation.asInstanceOf[InsertableRelation] - val data = DataFrame(sqlContext, query) - // Apply the schema of the existing table to the new data. - val df = sqlContext.internalCreateDataFrame(data.queryExecution.toRdd, logicalRelation.schema) - relation.insert(df, overwrite) - - // Invalidate the cache. - sqlContext.cacheManager.invalidateCache(logicalRelation) - - Seq.empty[Row] - } -} - -/** - * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. - * Writing to dynamic partitions is also supported. Each [[InsertIntoHadoopFsRelation]] issues a - * single write job, and owns a UUID that identifies this job. Each concrete implementation of - * [[HadoopFsRelation]] should use this UUID together with task id to generate unique file path for - * each task output file. This UUID is passed to executor side via a property named - * `spark.sql.sources.writeJobUUID`. - * - * Different writer containers, [[DefaultWriterContainer]] and [[DynamicPartitionWriterContainer]] - * are used to write to normal tables and tables with dynamic partitions. - * - * Basic work flow of this command is: - * - * 1. Driver side setup, including output committer initialization and data source specific - * preparation work for the write job to be issued. - * 2. Issues a write job consists of one or more executor side tasks, each of which writes all - * rows within an RDD partition. - * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any - * exception is thrown during task commitment, also aborts that task. - * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is - * thrown during job commitment, also aborts the job. 
- */ -private[sql] case class InsertIntoHadoopFsRelation( - @transient relation: HadoopFsRelation, - @transient query: LogicalPlan, - mode: SaveMode) - extends RunnableCommand { - - override def run(sqlContext: SQLContext): Seq[Row] = { - require( - relation.paths.length == 1, - s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") - - val hadoopConf = sqlContext.sparkContext.hadoopConfiguration - val outputPath = new Path(relation.paths.head) - val fs = outputPath.getFileSystem(hadoopConf) - val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - - val pathExists = fs.exists(qualifiedOutputPath) - val doInsertion = (mode, pathExists) match { - case (SaveMode.ErrorIfExists, true) => - throw new AnalysisException(s"path $qualifiedOutputPath already exists.") - case (SaveMode.Overwrite, true) => - fs.delete(qualifiedOutputPath, true) - true - case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => - true - case (SaveMode.Ignore, exists) => - !exists - case (s, exists) => - throw new IllegalStateException(s"unsupported save mode $s ($exists)") - } - // If we are appending data to an existing dir. - val isAppend = pathExists && (mode == SaveMode.Append) - - if (doInsertion) { - val job = new Job(hadoopConf) - job.setOutputKeyClass(classOf[Void]) - job.setOutputValueClass(classOf[InternalRow]) - FileOutputFormat.setOutputPath(job, qualifiedOutputPath) - - // We create a DataFrame by applying the schema of relation to the data to make sure. - // We are writing data based on the expected schema, - val df = { - // For partitioned relation r, r.schema's column ordering can be different from the column - // ordering of data.logicalPlan (partition columns are all moved after data column). We - // need a Project to adjust the ordering, so that inside InsertIntoHadoopFsRelation, we can - // safely apply the schema of r.schema to the data. - val project = Project( - relation.schema.map(field => new UnresolvedAttribute(Seq(field.name))), query) - - sqlContext.internalCreateDataFrame( - DataFrame(sqlContext, project).queryExecution.toRdd, relation.schema) - } - - val partitionColumns = relation.partitionColumns.fieldNames - if (partitionColumns.isEmpty) { - insert(new DefaultWriterContainer(relation, job, isAppend), df) - } else { - val writerContainer = new DynamicPartitionWriterContainer( - relation, job, partitionColumns, PartitioningUtils.DEFAULT_PARTITION_NAME, isAppend) - insertWithDynamicPartitions(sqlContext, writerContainer, df, partitionColumns) - } - } - - Seq.empty[Row] - } - - /** - * Inserts the content of the [[DataFrame]] into a table without any partitioning columns. - */ - private def insert(writerContainer: BaseWriterContainer, df: DataFrame): Unit = { - // Uses local vals for serialization - val needsConversion = relation.needConversion - val dataSchema = relation.dataSchema - - // This call shouldn't be put into the `try` block below because it only initializes and - // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. 
- writerContainer.driverSideSetup() - - try { - df.sqlContext.sparkContext.runJob(df.queryExecution.toRdd, writeRows _) - writerContainer.commitJob() - relation.refresh() - } catch { case cause: Throwable => - logError("Aborting job.", cause) - writerContainer.abortJob() - throw new SparkException("Job aborted.", cause) - } - - def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { - // If anything below fails, we should abort the task. - try { - writerContainer.executorSideSetup(taskContext) - - if (needsConversion) { - val converter = CatalystTypeConverters.createToScalaConverter(dataSchema) - .asInstanceOf[InternalRow => Row] - while (iterator.hasNext) { - val internalRow = iterator.next() - writerContainer.outputWriterForRow(internalRow).write(converter(internalRow)) - } - } else { - while (iterator.hasNext) { - val internalRow = iterator.next() - writerContainer.outputWriterForRow(internalRow) - .asInstanceOf[OutputWriterInternal].writeInternal(internalRow) - } - } - - writerContainer.commitTask() - } catch { case cause: Throwable => - logError("Aborting task.", cause) - writerContainer.abortTask() - throw new SparkException("Task failed while writing rows.", cause) - } - } - } - - /** - * Inserts the content of the [[DataFrame]] into a table with partitioning columns. - */ - private def insertWithDynamicPartitions( - sqlContext: SQLContext, - writerContainer: BaseWriterContainer, - df: DataFrame, - partitionColumns: Array[String]): Unit = { - // Uses a local val for serialization - val needsConversion = relation.needConversion - val dataSchema = relation.dataSchema - - require( - df.schema == relation.schema, - s"""DataFrame must have the same schema as the relation to which is inserted. - |DataFrame schema: ${df.schema} - |Relation schema: ${relation.schema} - """.stripMargin) - - val partitionColumnsInSpec = relation.partitionColumns.fieldNames - require( - partitionColumnsInSpec.sameElements(partitionColumns), - s"""Partition columns mismatch. - |Expected: ${partitionColumnsInSpec.mkString(", ")} - |Actual: ${partitionColumns.mkString(", ")} - """.stripMargin) - - val output = df.queryExecution.executedPlan.output - val (partitionOutput, dataOutput) = output.partition(a => partitionColumns.contains(a.name)) - val codegenEnabled = df.sqlContext.conf.codegenEnabled - - // This call shouldn't be put into the `try` block below because it only initializes and - // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. - writerContainer.driverSideSetup() - - try { - df.sqlContext.sparkContext.runJob(df.queryExecution.toRdd, writeRows _) - writerContainer.commitJob() - relation.refresh() - } catch { case cause: Throwable => - logError("Aborting job.", cause) - writerContainer.abortJob() - throw new SparkException("Job aborted.", cause) - } - - def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { - // If anything below fails, we should abort the task. - try { - writerContainer.executorSideSetup(taskContext) - - // Projects all partition columns and casts them to strings to build partition directories. 
- val partitionCasts = partitionOutput.map(Cast(_, StringType)) - val partitionProj = newProjection(codegenEnabled, partitionCasts, output) - val dataProj = newProjection(codegenEnabled, dataOutput, output) - - if (needsConversion) { - val converter = CatalystTypeConverters.createToScalaConverter(dataSchema) - .asInstanceOf[InternalRow => Row] - while (iterator.hasNext) { - val internalRow = iterator.next() - val partitionPart = partitionProj(internalRow) - val dataPart = converter(dataProj(internalRow)) - writerContainer.outputWriterForRow(partitionPart).write(dataPart) - } - } else { - while (iterator.hasNext) { - val internalRow = iterator.next() - val partitionPart = partitionProj(internalRow) - val dataPart = dataProj(internalRow) - writerContainer.outputWriterForRow(partitionPart) - .asInstanceOf[OutputWriterInternal].writeInternal(dataPart) - } - } - - writerContainer.commitTask() - } catch { case cause: Throwable => - logError("Aborting task.", cause) - writerContainer.abortTask() - throw new SparkException("Task failed while writing rows.", cause) - } - } - } - - // This is copied from SparkPlan, probably should move this to a more general place. - private def newProjection( - codegenEnabled: Boolean, - expressions: Seq[Expression], - inputSchema: Seq[Attribute]): Projection = { - log.debug( - s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled") - if (codegenEnabled) { - - try { - GenerateProjection.generate(expressions, inputSchema) - } catch { - case e: Exception => - if (sys.props.contains("spark.testing")) { - throw e - } else { - log.error("failed to generate projection, fallback to interpreted", e) - new InterpretedProjection(expressions, inputSchema) - } - } - } else { - new InterpretedProjection(expressions, inputSchema) - } - } -} - -private[sql] abstract class BaseWriterContainer( - @transient val relation: HadoopFsRelation, - @transient job: Job, - isAppend: Boolean) - extends SparkHadoopMapReduceUtil - with Logging - with Serializable { - - protected val serializableConf = new SerializableConfiguration(job.getConfiguration) - - // This UUID is used to avoid output file name collision between different appending write jobs. - // These jobs may belong to different SparkContext instances. Concrete data source implementations - // may use this UUID to generate unique file names (e.g., `part-r--.parquet`). - // The reason why this ID is used to identify a job rather than a single task output file is - // that, speculative tasks must generate the same output file name as the original task. - private val uniqueWriteJobId = UUID.randomUUID() - - // This is only used on driver side. - @transient private val jobContext: JobContext = job - - // The following fields are initialized and used on both driver and executor side. 
- @transient protected var outputCommitter: OutputCommitter = _ - @transient private var jobId: JobID = _ - @transient private var taskId: TaskID = _ - @transient private var taskAttemptId: TaskAttemptID = _ - @transient protected var taskAttemptContext: TaskAttemptContext = _ - - protected val outputPath: String = { - assert( - relation.paths.length == 1, - s"Cannot write to multiple destinations: ${relation.paths.mkString(",")}") - relation.paths.head - } - - protected val dataSchema = relation.dataSchema - - protected var outputWriterFactory: OutputWriterFactory = _ - - private var outputFormatClass: Class[_ <: OutputFormat[_, _]] = _ - - def driverSideSetup(): Unit = { - setupIDs(0, 0, 0) - setupConf() - - // This UUID is sent to executor side together with the serialized `Configuration` object within - // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate - // unique task output files. - job.getConfiguration.set("spark.sql.sources.writeJobUUID", uniqueWriteJobId.toString) - - // Order of the following two lines is important. For Hadoop 1, TaskAttemptContext constructor - // clones the Configuration object passed in. If we initialize the TaskAttemptContext first, - // configurations made in prepareJobForWrite(job) are not populated into the TaskAttemptContext. - // - // Also, the `prepareJobForWrite` call must happen before initializing output format and output - // committer, since their initialization involve the job configuration, which can be potentially - // decorated in `prepareJobForWrite`. - outputWriterFactory = relation.prepareJobForWrite(job) - taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId) - - outputFormatClass = job.getOutputFormatClass - outputCommitter = newOutputCommitter(taskAttemptContext) - outputCommitter.setupJob(jobContext) - } - - def executorSideSetup(taskContext: TaskContext): Unit = { - setupIDs(taskContext.stageId(), taskContext.partitionId(), taskContext.attemptNumber()) - setupConf() - taskAttemptContext = newTaskAttemptContext(serializableConf.value, taskAttemptId) - outputCommitter = newOutputCommitter(taskAttemptContext) - outputCommitter.setupTask(taskAttemptContext) - initWriters() - } - - protected def getWorkPath: String = { - outputCommitter match { - // FileOutputCommitter writes to a temporary location returned by `getWorkPath`. - case f: MapReduceFileOutputCommitter => f.getWorkPath.toString - case _ => outputPath - } - } - - private def newOutputCommitter(context: TaskAttemptContext): OutputCommitter = { - val defaultOutputCommitter = outputFormatClass.newInstance().getOutputCommitter(context) - - if (isAppend) { - // If we are appending data to an existing dir, we will only use the output committer - // associated with the file output format since it is not safe to use a custom - // committer for appending. For example, in S3, direct parquet output committer may - // leave partial data in the destination dir when the the appending job fails. - logInfo( - s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName} " + - "for appending.") - defaultOutputCommitter - } else { - val committerClass = context.getConfiguration.getClass( - SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) - - Option(committerClass).map { clazz => - logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") - - // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat - // has an associated output committer. 
To override this output committer, - // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. - // If a data source needs to override the output committer, it needs to set the - // output committer in prepareForWrite method. - if (classOf[MapReduceFileOutputCommitter].isAssignableFrom(clazz)) { - // The specified output committer is a FileOutputCommitter. - // So, we will use the FileOutputCommitter-specified constructor. - val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) - ctor.newInstance(new Path(outputPath), context) - } else { - // The specified output committer is just a OutputCommitter. - // So, we will use the no-argument constructor. - val ctor = clazz.getDeclaredConstructor() - ctor.newInstance() - } - }.getOrElse { - // If output committer class is not set, we will use the one associated with the - // file output format. - logInfo( - s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName}") - defaultOutputCommitter - } - } - } - - private def setupIDs(jobId: Int, splitId: Int, attemptId: Int): Unit = { - this.jobId = SparkHadoopWriter.createJobID(new Date, jobId) - this.taskId = new TaskID(this.jobId, true, splitId) - this.taskAttemptId = new TaskAttemptID(taskId, attemptId) - } - - private def setupConf(): Unit = { - serializableConf.value.set("mapred.job.id", jobId.toString) - serializableConf.value.set("mapred.tip.id", taskAttemptId.getTaskID.toString) - serializableConf.value.set("mapred.task.id", taskAttemptId.toString) - serializableConf.value.setBoolean("mapred.task.is.map", true) - serializableConf.value.setInt("mapred.task.partition", 0) - } - - // Called on executor side when writing rows - def outputWriterForRow(row: InternalRow): OutputWriter - - protected def initWriters(): Unit - - def commitTask(): Unit = { - SparkHadoopMapRedUtil.commitTask( - outputCommitter, taskAttemptContext, jobId.getId, taskId.getId, taskAttemptId.getId) - } - - def abortTask(): Unit = { - if (outputCommitter != null) { - outputCommitter.abortTask(taskAttemptContext) - } - logError(s"Task attempt $taskAttemptId aborted.") - } - - def commitJob(): Unit = { - outputCommitter.commitJob(jobContext) - logInfo(s"Job $jobId committed.") - } - - def abortJob(): Unit = { - if (outputCommitter != null) { - outputCommitter.abortJob(jobContext, JobStatus.State.FAILED) - } - logError(s"Job $jobId aborted.") - } -} - -private[sql] class DefaultWriterContainer( - @transient relation: HadoopFsRelation, - @transient job: Job, - isAppend: Boolean) - extends BaseWriterContainer(relation, job, isAppend) { - - @transient private var writer: OutputWriter = _ - - override protected def initWriters(): Unit = { - taskAttemptContext.getConfiguration.set("spark.sql.sources.output.path", outputPath) - writer = outputWriterFactory.newInstance(getWorkPath, dataSchema, taskAttemptContext) - } - - override def outputWriterForRow(row: InternalRow): OutputWriter = writer - - override def commitTask(): Unit = { - try { - assert(writer != null, "OutputWriter instance should have been initialized") - writer.close() - super.commitTask() - } catch { case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and will - // cause `abortTask()` to be invoked. 
- throw new RuntimeException("Failed to commit task", cause) - } - } - - override def abortTask(): Unit = { - try { - // It's possible that the task fails before `writer` gets initialized - if (writer != null) { - writer.close() - } - } finally { - super.abortTask() - } - } -} - -private[sql] class DynamicPartitionWriterContainer( - @transient relation: HadoopFsRelation, - @transient job: Job, - partitionColumns: Array[String], - defaultPartitionName: String, - isAppend: Boolean) - extends BaseWriterContainer(relation, job, isAppend) { - - // All output writers are created on executor side. - @transient protected var outputWriters: java.util.HashMap[String, OutputWriter] = _ - - override protected def initWriters(): Unit = { - outputWriters = new java.util.HashMap[String, OutputWriter] - } - - // The `row` argument is supposed to only contain partition column values which have been casted - // to strings. - override def outputWriterForRow(row: InternalRow): OutputWriter = { - val partitionPath = { - val partitionPathBuilder = new StringBuilder - var i = 0 - - while (i < partitionColumns.length) { - val col = partitionColumns(i) - val partitionValueString = { - val string = row.getUTF8String(i) - if (string.eq(null)) { - defaultPartitionName - } else { - PartitioningUtils.escapePathName(string.toString) - } - } - - if (i > 0) { - partitionPathBuilder.append(Path.SEPARATOR_CHAR) - } - - partitionPathBuilder.append(s"$col=$partitionValueString") - i += 1 - } - - partitionPathBuilder.toString() - } - - val writer = outputWriters.get(partitionPath) - if (writer.eq(null)) { - val path = new Path(getWorkPath, partitionPath) - taskAttemptContext.getConfiguration.set( - "spark.sql.sources.output.path", new Path(outputPath, partitionPath).toString) - val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext) - outputWriters.put(partitionPath, newWriter) - newWriter - } else { - writer - } - } - - private def clearOutputWriters(): Unit = { - if (!outputWriters.isEmpty) { - asScalaIterator(outputWriters.values().iterator()).foreach(_.close()) - outputWriters.clear() - } - } - - override def commitTask(): Unit = { - try { - clearOutputWriters() - super.commitTask() - } catch { case cause: Throwable => - throw new RuntimeException("Failed to commit task", cause) - } - } - - override def abortTask(): Unit = { - try { - clearOutputWriters() - } finally { - super.abortTask() - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index 0cdb407ad57b..31d6b75e1347 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -17,340 +17,12 @@ package org.apache.spark.sql.execution.datasources -import scala.language.{existentials, implicitConversions} -import scala.util.matching.Regex - -import org.apache.hadoop.fs.Path - -import org.apache.spark.Logging -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, TableIdentifier} import org.apache.spark.sql.execution.RunnableCommand -import org.apache.spark.sql.sources._ import 
org.apache.spark.sql.types._ -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SQLContext, SaveMode} -import org.apache.spark.util.Utils - -/** - * A parser for foreign DDL commands. - */ -private[sql] class DDLParser( - parseQuery: String => LogicalPlan) - extends AbstractSparkSQLParser with DataTypeParser with Logging { - - def parse(input: String, exceptionOnError: Boolean): LogicalPlan = { - try { - parse(input) - } catch { - case ddlException: DDLException => throw ddlException - case _ if !exceptionOnError => parseQuery(input) - case x: Throwable => throw x - } - } - - // Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword` - // properties via reflection the class in runtime for constructing the SqlLexical object - protected val CREATE = Keyword("CREATE") - protected val TEMPORARY = Keyword("TEMPORARY") - protected val TABLE = Keyword("TABLE") - protected val IF = Keyword("IF") - protected val NOT = Keyword("NOT") - protected val EXISTS = Keyword("EXISTS") - protected val USING = Keyword("USING") - protected val OPTIONS = Keyword("OPTIONS") - protected val DESCRIBE = Keyword("DESCRIBE") - protected val EXTENDED = Keyword("EXTENDED") - protected val AS = Keyword("AS") - protected val COMMENT = Keyword("COMMENT") - protected val REFRESH = Keyword("REFRESH") - - protected lazy val ddl: Parser[LogicalPlan] = createTable | describeTable | refreshTable - - protected def start: Parser[LogicalPlan] = ddl - - /** - * `CREATE [TEMPORARY] TABLE avroTable [IF NOT EXISTS] - * USING org.apache.spark.sql.avro - * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` - * or - * `CREATE [TEMPORARY] TABLE avroTable(intField int, stringField string...) [IF NOT EXISTS] - * USING org.apache.spark.sql.avro - * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` - * or - * `CREATE [TEMPORARY] TABLE avroTable [IF NOT EXISTS] - * USING org.apache.spark.sql.avro - * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` - * AS SELECT ... - */ - protected lazy val createTable: Parser[LogicalPlan] = - // TODO: Support database.table. - (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident ~ - tableCols.? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? ^^ { - case temp ~ allowExisting ~ tableName ~ columns ~ provider ~ opts ~ query => - if (temp.isDefined && allowExisting.isDefined) { - throw new DDLException( - "a CREATE TEMPORARY TABLE statement does not allow IF NOT EXISTS clause.") - } - - val options = opts.getOrElse(Map.empty[String, String]) - if (query.isDefined) { - if (columns.isDefined) { - throw new DDLException( - "a CREATE TABLE AS SELECT statement does not allow column definitions.") - } - // When IF NOT EXISTS clause appears in the query, the save mode will be ignore. 
- val mode = if (allowExisting.isDefined) { - SaveMode.Ignore - } else if (temp.isDefined) { - SaveMode.Overwrite - } else { - SaveMode.ErrorIfExists - } - - val queryPlan = parseQuery(query.get) - CreateTableUsingAsSelect(tableName, - provider, - temp.isDefined, - Array.empty[String], - mode, - options, - queryPlan) - } else { - val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) - CreateTableUsing( - tableName, - userSpecifiedSchema, - provider, - temp.isDefined, - options, - allowExisting.isDefined, - managedIfNoPath = false) - } - } - - protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")" - - /* - * describe [extended] table avroTable - * This will display all columns of table `avroTable` includes column_name,column_type,comment - */ - protected lazy val describeTable: Parser[LogicalPlan] = - (DESCRIBE ~> opt(EXTENDED)) ~ (ident <~ ".").? ~ ident ^^ { - case e ~ db ~ tbl => - val tblIdentifier = db match { - case Some(dbName) => - Seq(dbName, tbl) - case None => - Seq(tbl) - } - DescribeCommand(UnresolvedRelation(tblIdentifier, None), e.isDefined) - } - - protected lazy val refreshTable: Parser[LogicalPlan] = - REFRESH ~> TABLE ~> (ident <~ ".").? ~ ident ^^ { - case maybeDatabaseName ~ tableName => - RefreshTable(TableIdentifier(tableName, maybeDatabaseName)) - } - - protected lazy val options: Parser[Map[String, String]] = - "(" ~> repsep(pair, ",") <~ ")" ^^ { case s: Seq[(String, String)] => s.toMap } - - protected lazy val className: Parser[String] = repsep(ident, ".") ^^ { case s => s.mkString(".")} - - override implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch( - s"identifier matching regex $regex", { - case lexical.Identifier(str) if regex.unapplySeq(str).isDefined => str - case lexical.Keyword(str) if regex.unapplySeq(str).isDefined => str - } - ) - - protected lazy val optionPart: Parser[String] = "[_a-zA-Z][_a-zA-Z0-9]*".r ^^ { - case name => name - } - - protected lazy val optionName: Parser[String] = repsep(optionPart, ".") ^^ { - case parts => parts.mkString(".") - } - - protected lazy val pair: Parser[(String, String)] = - optionName ~ stringLit ^^ { case k ~ v => (k, v) } - - protected lazy val column: Parser[StructField] = - ident ~ dataType ~ (COMMENT ~> stringLit).? ^^ { case columnName ~ typ ~ cm => - val meta = cm match { - case Some(comment) => - new MetadataBuilder().putString(COMMENT.str.toLowerCase, comment).build() - case None => Metadata.empty - } - - StructField(columnName, typ, nullable = true, meta) - } -} - -private[sql] object ResolvedDataSource { - - private val builtinSources = Map( - "jdbc" -> "org.apache.spark.sql.jdbc.DefaultSource", - "json" -> "org.apache.spark.sql.json.DefaultSource", - "parquet" -> "org.apache.spark.sql.parquet.DefaultSource", - "orc" -> "org.apache.spark.sql.hive.orc.DefaultSource" - ) - - /** Given a provider name, look up the data source class definition. 
*/ - def lookupDataSource(provider: String): Class[_] = { - val loader = Utils.getContextOrSparkClassLoader - - if (builtinSources.contains(provider)) { - return loader.loadClass(builtinSources(provider)) - } - - try { - loader.loadClass(provider) - } catch { - case cnf: java.lang.ClassNotFoundException => - try { - loader.loadClass(provider + ".DefaultSource") - } catch { - case cnf: java.lang.ClassNotFoundException => - if (provider.startsWith("org.apache.spark.sql.hive.orc")) { - sys.error("The ORC data source must be used with Hive support enabled.") - } else { - sys.error(s"Failed to load class for data source: $provider") - } - } - } - } - - /** Create a [[ResolvedDataSource]] for reading data in. */ - def apply( - sqlContext: SQLContext, - userSpecifiedSchema: Option[StructType], - partitionColumns: Array[String], - provider: String, - options: Map[String, String]): ResolvedDataSource = { - val clazz: Class[_] = lookupDataSource(provider) - def className: String = clazz.getCanonicalName - val relation = userSpecifiedSchema match { - case Some(schema: StructType) => clazz.newInstance() match { - case dataSource: SchemaRelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) - case dataSource: HadoopFsRelationProvider => - val maybePartitionsSchema = if (partitionColumns.isEmpty) { - None - } else { - Some(partitionColumnsSchema(schema, partitionColumns)) - } - - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - val paths = { - val patternPath = new Path(caseInsensitiveOptions("path")) - val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray - } - - val dataSchema = - StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable - - dataSource.createRelation( - sqlContext, - paths, - Some(dataSchema), - maybePartitionsSchema, - caseInsensitiveOptions) - case dataSource: org.apache.spark.sql.sources.RelationProvider => - throw new AnalysisException(s"$className does not allow user-specified schemas.") - case _ => - throw new AnalysisException(s"$className is not a RelationProvider.") - } - - case None => clazz.newInstance() match { - case dataSource: RelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) - case dataSource: HadoopFsRelationProvider => - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - val paths = { - val patternPath = new Path(caseInsensitiveOptions("path")) - val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray - } - dataSource.createRelation(sqlContext, paths, None, None, caseInsensitiveOptions) - case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => - throw new AnalysisException( - s"A schema needs to be specified when using $className.") - case _ => - throw new AnalysisException( - s"$className is neither a RelationProvider nor a FSBasedRelationProvider.") - } - } - new ResolvedDataSource(clazz, relation) - } - - private def partitionColumnsSchema( - schema: StructType, - partitionColumns: Array[String]): StructType = { - StructType(partitionColumns.map { col => - schema.find(_.name == col).getOrElse { - throw new 
RuntimeException(s"Partition column $col not found in schema $schema") - } - }).asNullable - } - - /** Create a [[ResolvedDataSource]] for saving the content of the given [[DataFrame]]. */ - def apply( - sqlContext: SQLContext, - provider: String, - partitionColumns: Array[String], - mode: SaveMode, - options: Map[String, String], - data: DataFrame): ResolvedDataSource = { - if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) { - throw new AnalysisException("Cannot save interval data type into external storage.") - } - val clazz: Class[_] = lookupDataSource(provider) - val relation = clazz.newInstance() match { - case dataSource: CreatableRelationProvider => - dataSource.createRelation(sqlContext, mode, options, data) - case dataSource: HadoopFsRelationProvider => - // Don't glob path for the write path. The contracts here are: - // 1. Only one output path can be specified on the write path; - // 2. Output path must be a legal HDFS style file system path; - // 3. It's OK that the output path doesn't exist yet; - val caseInsensitiveOptions = new CaseInsensitiveMap(options) - val outputPath = { - val path = new Path(caseInsensitiveOptions("path")) - val fs = path.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - path.makeQualified(fs.getUri, fs.getWorkingDirectory) - } - val dataSchema = StructType(data.schema.filterNot(f => partitionColumns.contains(f.name))) - val r = dataSource.createRelation( - sqlContext, - Array(outputPath.toString), - Some(dataSchema.asNullable), - Some(partitionColumnsSchema(data.schema, partitionColumns)), - caseInsensitiveOptions) - - // For partitioned relation r, r.schema's column ordering can be different from the column - // ordering of data.logicalPlan (partition columns are all moved after data column). This - // will be adjusted within InsertIntoHadoopFsRelation. - sqlContext.executePlan( - InsertIntoHadoopFsRelation( - r, - data.logicalPlan, - mode)).toRdd - r - case _ => - sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") - } - new ResolvedDataSource(clazz, relation) - } -} - -private[sql] case class ResolvedDataSource(provider: Class[_], relation: BaseRelation) +import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} /** * Returned for the "DESCRIBE [EXTENDED] [dbName.]tableName" command. @@ -358,11 +30,12 @@ private[sql] case class ResolvedDataSource(provider: Class[_], relation: BaseRel * @param isExtended True if "DESCRIBE EXTENDED" is used. Otherwise, false. * It is effective only when the table is a Hive table. */ -private[sql] case class DescribeCommand( +case class DescribeCommand( table: LogicalPlan, isExtended: Boolean) extends LogicalPlan with Command { override def children: Seq[LogicalPlan] = Seq.empty + override val output: Seq[Attribute] = Seq( // Column names are based on Hive. AttributeReference("col_name", StringType, nullable = false, @@ -370,7 +43,8 @@ private[sql] case class DescribeCommand( AttributeReference("data_type", StringType, nullable = false, new MetadataBuilder().putString("comment", "data type of the column").build())(), AttributeReference("comment", StringType, nullable = false, - new MetadataBuilder().putString("comment", "comment of the column").build())()) + new MetadataBuilder().putString("comment", "comment of the column").build())() + ) } /** @@ -378,8 +52,8 @@ private[sql] case class DescribeCommand( * @param allowExisting If it is true, we will do nothing when the table already exists. 
* If it is false, an exception will be thrown */ -private[sql] case class CreateTableUsing( - tableName: String, +case class CreateTableUsing( + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, temporary: Boolean, @@ -397,8 +71,9 @@ private[sql] case class CreateTableUsing( * can analyze the logical plan that will be used to populate the table. * So, [[PreWriteCheck]] can detect cases that are not allowed. */ -private[sql] case class CreateTableUsingAsSelect( - tableName: String, +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). +case class CreateTableUsingAsSelect( + tableIdent: TableIdentifier, provider: String, temporary: Boolean, partitionColumns: Array[String], @@ -406,12 +81,10 @@ private[sql] case class CreateTableUsingAsSelect( options: Map[String, String], child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = Seq.empty[Attribute] - // TODO: Override resolved after we support databaseName. - // override lazy val resolved = databaseName != None && childrenResolved } -private[sql] case class CreateTempTableUsing( - tableName: String, +case class CreateTempTableUsing( + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String]) extends RunnableCommand { @@ -419,14 +92,16 @@ private[sql] case class CreateTempTableUsing( def run(sqlContext: SQLContext): Seq[Row] = { val resolved = ResolvedDataSource( sqlContext, userSpecifiedSchema, Array.empty[String], provider, options) - sqlContext.registerDataFrameAsTable( - DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) + sqlContext.catalog.registerTable( + tableIdent.toSeq, + DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) + Seq.empty[Row] } } -private[sql] case class CreateTempTableUsingAsSelect( - tableName: String, +case class CreateTempTableUsingAsSelect( + tableIdent: TableIdentifier, provider: String, partitionColumns: Array[String], mode: SaveMode, @@ -436,14 +111,15 @@ private[sql] case class CreateTempTableUsingAsSelect( override def run(sqlContext: SQLContext): Seq[Row] = { val df = DataFrame(sqlContext, query) val resolved = ResolvedDataSource(sqlContext, provider, partitionColumns, mode, options, df) - sqlContext.registerDataFrameAsTable( - DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) + sqlContext.catalog.registerTable( + tableIdent.toSeq, + DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) Seq.empty[Row] } } -private[sql] case class RefreshTable(tableIdent: TableIdentifier) +case class RefreshTable(tableIdent: TableIdentifier) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { @@ -472,7 +148,7 @@ private[sql] case class RefreshTable(tableIdent: TableIdentifier) /** * Builds a map in which keys are case insensitive */ -protected[sql] class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] +class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] with Serializable { val baseMap = map.map(kv => kv.copy(_1 = kv._1.toLowerCase)) @@ -490,4 +166,4 @@ protected[sql] class CaseInsensitiveMap(map: Map[String, String]) extends Map[St /** * The exception thrown from the DDL parser. 
*/ -protected[sql] class DDLException(message: String) extends Exception(message) +class DDLException(message: String) extends RuntimeException(message) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DefaultSource.scala new file mode 100644 index 000000000000..6773afc794f9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DefaultSource.scala @@ -0,0 +1,62 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.datasources.jdbc + +import java.util.Properties + +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, DataSourceRegister} + +class DefaultSource extends RelationProvider with DataSourceRegister { + + override def shortName(): String = "jdbc" + + /** Returns a new base relation with the given parameters. */ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + val url = parameters.getOrElse("url", sys.error("Option 'url' not specified")) + val driver = parameters.getOrElse("driver", null) + val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified")) + val partitionColumn = parameters.getOrElse("partitionColumn", null) + val lowerBound = parameters.getOrElse("lowerBound", null) + val upperBound = parameters.getOrElse("upperBound", null) + val numPartitions = parameters.getOrElse("numPartitions", null) + + if (driver != null) DriverRegistry.register(driver) + + if (partitionColumn != null + && (lowerBound == null || upperBound == null || numPartitions == null)) { + sys.error("Partitioning incompletely specified") + } + + val partitionInfo = if (partitionColumn == null) { + null + } else { + JDBCPartitioningInfo( + partitionColumn, + lowerBound.toLong, + upperBound.toLong, + numPartitions.toInt) + } + val parts = JDBCRelation.columnPartition(partitionInfo) + val properties = new Properties() // Additional properties that we will pass to getConnection + parameters.foreach(kv => properties.setProperty(kv._1, kv._2)) + JDBCRelation(url, table, parts, properties)(sqlContext) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala new file mode 100644 index 000000000000..7ccd61ed469e --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.jdbc + +import java.sql.{Driver, DriverManager} + +import scala.collection.mutable + +import org.apache.spark.Logging +import org.apache.spark.util.Utils + +/** + * java.sql.DriverManager is always loaded by bootstrap classloader, + * so it can't load JDBC drivers accessible by Spark ClassLoader. + * + * To solve the problem, drivers from user-supplied jars are wrapped into thin wrapper. + */ +object DriverRegistry extends Logging { + + private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty + + def register(className: String): Unit = { + val cls = Utils.getContextOrSparkClassLoader.loadClass(className) + if (cls.getClassLoader == null) { + logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") + } else if (wrapperMap.get(className).isDefined) { + logTrace(s"Wrapper for $className already exists") + } else { + synchronized { + if (wrapperMap.get(className).isEmpty) { + val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) + DriverManager.registerDriver(wrapper) + wrapperMap(className) = wrapper + logTrace(s"Wrapper for $className registered") + } + } + } + } + + def getDriverClassName(url: String): String = DriverManager.getDriver(url) match { + case wrapper: DriverWrapper => wrapper.wrapped.getClass.getCanonicalName + case driver => driver.getClass.getCanonicalName + } +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverWrapper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverWrapper.scala new file mode 100644 index 000000000000..18263fe227d0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverWrapper.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.jdbc + +import java.sql.{Connection, Driver, DriverPropertyInfo, SQLFeatureNotSupportedException} +import java.util.Properties + +/** + * A wrapper for a JDBC Driver to work around SPARK-6913. + * + * The problem is in `java.sql.DriverManager` class that can't access drivers loaded by + * Spark ClassLoader. + */ +class DriverWrapper(val wrapped: Driver) extends Driver { + override def acceptsURL(url: String): Boolean = wrapped.acceptsURL(url) + + override def jdbcCompliant(): Boolean = wrapped.jdbcCompliant() + + override def getPropertyInfo(url: String, info: Properties): Array[DriverPropertyInfo] = { + wrapped.getPropertyInfo(url, info) + } + + override def getMinorVersion: Int = wrapped.getMinorVersion + + def getParentLogger: java.util.logging.Logger = { + throw new SQLFeatureNotSupportedException( + s"${this.getClass.getName}.getParentLogger is not yet implemented.") + } + + override def connect(url: String, info: Properties): Connection = wrapped.connect(url, info) + + override def getMajorVersion: Int = wrapped.getMajorVersion +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 3cf70db6b7b0..730d88b024cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.jdbc +package org.apache.spark.sql.execution.datasources.jdbc import java.sql.{Connection, DriverManager, ResultSet, ResultSetMetaData, SQLException} import java.util.Properties @@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.jdbc.JdbcDialects import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -117,7 +118,7 @@ private[sql] object JDBCRDD extends Logging { */ def resolveTable(url: String, table: String, properties: Properties): StructType = { val dialect = JdbcDialects.get(url) - val conn: Connection = DriverManager.getConnection(url, properties) + val conn: Connection = getConnector(properties.getProperty("driver"), url, properties)() try { val rs = conn.prepareStatement(s"SELECT * FROM $table WHERE 1=0").executeQuery() try { @@ -170,7 +171,8 @@ private[sql] object JDBCRDD extends Logging { * getConnector is run on the driver code, while the function it returns * is run on the executor. * - * @param driver - The class name of the JDBC driver for the given url. + * @param driver - The class name of the JDBC driver for the given url, or null if the class name + * is not necessary. * @param url - The JDBC url to connect to. * * @return A function that loads the driver and connects to the url. 
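// For illustration only, a minimal sketch of how the thunk returned by getConnector is
// meant to be used (mirroring JdbcUtils.saveTable later in this patch); the driver class
// and URL below are assumed placeholder values, not taken from the sources:
//
//   import java.sql.Connection
//   import java.util.Properties
//
//   val props = new Properties()
//   props.setProperty("driver", "org.h2.Driver")   // assumed example driver class
//   val url = "jdbc:h2:mem:demo"                    // assumed example JDBC URL
//   val getConnection: () => Connection =
//     JDBCRDD.getConnector(props.getProperty("driver"), url, props)
//   df.foreachPartition { _ =>                      // df: some DataFrame
//     val conn = getConnection()                    // connection opened on the executor
//     try { /* read or write via conn */ } finally { conn.close() }
//   }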
@@ -180,9 +182,8 @@ private[sql] object JDBCRDD extends Logging { try { if (driver != null) DriverRegistry.register(driver) } catch { - case e: ClassNotFoundException => { - logWarning(s"Couldn't find class $driver", e); - } + case e: ClassNotFoundException => + logWarning(s"Couldn't find class $driver", e) } DriverManager.getConnection(url, properties) } @@ -261,7 +262,7 @@ private[sql] class JDBCRDD( * Converts value to SQL expression. */ private def compileValue(value: Any): Any = value match { - case stringValue: UTF8String => s"'${escapeSql(stringValue.toString)}'" + case stringValue: String => s"'${escapeSql(stringValue)}'" case _ => value } @@ -344,7 +345,6 @@ private[sql] class JDBCRDD( }).toArray } - /** * Runs the SQL query against the JDBC driver. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala similarity index 72% rename from sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index 41d0ecb4bbfb..f9300dc2cb52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.jdbc +package org.apache.spark.sql.execution.datasources.jdbc import java.util.Properties @@ -77,42 +77,6 @@ private[sql] object JDBCRelation { } } -private[sql] class DefaultSource extends RelationProvider { - /** Returns a new base relation with the given parameters. */ - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - val url = parameters.getOrElse("url", sys.error("Option 'url' not specified")) - val driver = parameters.getOrElse("driver", null) - val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified")) - val partitionColumn = parameters.getOrElse("partitionColumn", null) - val lowerBound = parameters.getOrElse("lowerBound", null) - val upperBound = parameters.getOrElse("upperBound", null) - val numPartitions = parameters.getOrElse("numPartitions", null) - - if (driver != null) DriverRegistry.register(driver) - - if (partitionColumn != null - && (lowerBound == null || upperBound == null || numPartitions == null)) { - sys.error("Partitioning incompletely specified") - } - - val partitionInfo = if (partitionColumn == null) { - null - } else { - JDBCPartitioningInfo( - partitionColumn, - lowerBound.toLong, - upperBound.toLong, - numPartitions.toInt) - } - val parts = JDBCRelation.columnPartition(partitionInfo) - val properties = new Properties() // Additional properties that we will pass to getConnection - parameters.foreach(kv => properties.setProperty(kv._1, kv._2)) - JDBCRelation(url, table, parts, properties)(sqlContext) - } -} - private[sql] case class JDBCRelation( url: String, table: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala new file mode 100644 index 000000000000..2d0e736ee476 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.jdbc + +import java.sql.{Connection, PreparedStatement} +import java.util.Properties + +import scala.util.Try + +import org.apache.spark.Logging +import org.apache.spark.sql.jdbc.JdbcDialects +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row} + +/** + * Util functions for JDBC tables. + */ +object JdbcUtils extends Logging { + + /** + * Establishes a JDBC connection. + */ + def createConnection(url: String, connectionProperties: Properties): Connection = { + JDBCRDD.getConnector(connectionProperties.getProperty("driver"), url, connectionProperties)() + } + + /** + * Returns true if the table already exists in the JDBC database. + */ + def tableExists(conn: Connection, table: String): Boolean = { + // Somewhat hacky, but there isn't a good way to identify whether a table exists for all + // SQL database systems, considering "table" could also include the database name. + Try(conn.prepareStatement(s"SELECT 1 FROM $table LIMIT 1").executeQuery().next()).isSuccess + } + + /** + * Drops a table from the JDBC database. + */ + def dropTable(conn: Connection, table: String): Unit = { + conn.prepareStatement(s"DROP TABLE $table").executeUpdate() + } + + /** + * Returns a PreparedStatement that inserts a row into table via conn. + */ + def insertStatement(conn: Connection, table: String, rddSchema: StructType): PreparedStatement = { + val sql = new StringBuilder(s"INSERT INTO $table VALUES (") + var fieldsLeft = rddSchema.fields.length + while (fieldsLeft > 0) { + sql.append("?") + if (fieldsLeft > 1) sql.append(", ") else sql.append(")") + fieldsLeft = fieldsLeft - 1 + } + conn.prepareStatement(sql.toString()) + } + + /** + * Saves a partition of a DataFrame to the JDBC database. This is done in + * a single database transaction in order to avoid repeatedly inserting + * data as much as possible. + * + * It is still theoretically possible for rows in a DataFrame to be + * inserted into the database more than once if a stage somehow fails after + * the commit occurs but before the stage can return successfully. + * + * This is not a closure inside saveTable() because apparently cosmetic + * implementation changes elsewhere might easily render such a closure + * non-Serializable. Instead, we explicitly close over all variables that + * are used. + */ + def savePartition( + getConnection: () => Connection, + table: String, + iterator: Iterator[Row], + rddSchema: StructType, + nullTypes: Array[Int]): Iterator[Byte] = { + val conn = getConnection() + var committed = false + try { + conn.setAutoCommit(false) // Everything in the same db transaction. 
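// A sketch of what insertStatement (defined above) prepares for this table: for a
// schema with three fields it builds "INSERT INTO <table> VALUES (?, ?, ?)", and the
// loop below binds each Row's values to the placeholders by position, using
// nullTypes(i) for SQL NULLs. The commit/rollback at the end of this method makes
// each partition a single transaction.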
+ val stmt = insertStatement(conn, table, rddSchema) + try { + while (iterator.hasNext) { + val row = iterator.next() + val numFields = rddSchema.fields.length + var i = 0 + while (i < numFields) { + if (row.isNullAt(i)) { + stmt.setNull(i + 1, nullTypes(i)) + } else { + rddSchema.fields(i).dataType match { + case IntegerType => stmt.setInt(i + 1, row.getInt(i)) + case LongType => stmt.setLong(i + 1, row.getLong(i)) + case DoubleType => stmt.setDouble(i + 1, row.getDouble(i)) + case FloatType => stmt.setFloat(i + 1, row.getFloat(i)) + case ShortType => stmt.setInt(i + 1, row.getShort(i)) + case ByteType => stmt.setInt(i + 1, row.getByte(i)) + case BooleanType => stmt.setBoolean(i + 1, row.getBoolean(i)) + case StringType => stmt.setString(i + 1, row.getString(i)) + case BinaryType => stmt.setBytes(i + 1, row.getAs[Array[Byte]](i)) + case TimestampType => stmt.setTimestamp(i + 1, row.getAs[java.sql.Timestamp](i)) + case DateType => stmt.setDate(i + 1, row.getAs[java.sql.Date](i)) + case t: DecimalType => stmt.setBigDecimal(i + 1, row.getDecimal(i)) + case _ => throw new IllegalArgumentException( + s"Can't translate non-null value for field $i") + } + } + i = i + 1 + } + stmt.executeUpdate() + } + } finally { + stmt.close() + } + conn.commit() + committed = true + } finally { + if (!committed) { + // The stage must fail. We got here through an exception path, so + // let the exception through unless rollback() or close() want to + // tell the user about another problem. + conn.rollback() + conn.close() + } else { + // The stage must succeed. We cannot propagate any exception close() might throw. + try { + conn.close() + } catch { + case e: Exception => logWarning("Transaction succeeded, but closing failed", e) + } + } + } + Array[Byte]().iterator + } + + /** + * Compute the schema string for this RDD. + */ + def schemaString(df: DataFrame, url: String): String = { + val sb = new StringBuilder() + val dialect = JdbcDialects.get(url) + df.schema.fields foreach { field => { + val name = field.name + val typ: String = + dialect.getJDBCType(field.dataType).map(_.databaseTypeDefinition).getOrElse( + field.dataType match { + case IntegerType => "INTEGER" + case LongType => "BIGINT" + case DoubleType => "DOUBLE PRECISION" + case FloatType => "REAL" + case ShortType => "INTEGER" + case ByteType => "BYTE" + case BooleanType => "BIT(1)" + case StringType => "TEXT" + case BinaryType => "BLOB" + case TimestampType => "TIMESTAMP" + case DateType => "DATE" + case t: DecimalType => s"DECIMAL(${t.precision},${t.scale})" + case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC") + }) + val nullable = if (field.nullable) "" else "NOT NULL" + sb.append(s", $name $typ $nullable") + }} + if (sb.length < 2) "" else sb.substring(2) + } + + /** + * Saves the RDD to the database in a single transaction. 
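* More precisely, the DataFrame is written with one transaction per partition:
* each partition is handed to savePartition above, which commits or rolls back
* its own connection.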
+ */ + def saveTable( + df: DataFrame, + url: String, + table: String, + properties: Properties = new Properties()) { + val dialect = JdbcDialects.get(url) + val nullTypes: Array[Int] = df.schema.fields.map { field => + dialect.getJDBCType(field.dataType).map(_.jdbcNullType).getOrElse( + field.dataType match { + case IntegerType => java.sql.Types.INTEGER + case LongType => java.sql.Types.BIGINT + case DoubleType => java.sql.Types.DOUBLE + case FloatType => java.sql.Types.REAL + case ShortType => java.sql.Types.INTEGER + case ByteType => java.sql.Types.INTEGER + case BooleanType => java.sql.Types.BIT + case StringType => java.sql.Types.CLOB + case BinaryType => java.sql.Types.BLOB + case TimestampType => java.sql.Types.TIMESTAMP + case DateType => java.sql.Types.DATE + case t: DecimalType => java.sql.Types.DECIMAL + case _ => throw new IllegalArgumentException( + s"Can't translate null value for field $field") + }) + } + + val rddSchema = df.schema + val driver: String = DriverRegistry.getDriverClassName(url) + val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties) + df.foreachPartition { iterator => + savePartition(getConnection, table, iterator, rddSchema, nullTypes) + } + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala similarity index 89% rename from sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 04ab5e221788..b6f3410bad69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -15,13 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json import com.fasterxml.jackson.core._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion -import org.apache.spark.sql.json.JacksonUtils.nextUntil +import org.apache.spark.sql.execution.datasources.json.JacksonUtils.nextUntil import org.apache.spark.sql.types._ private[sql] object InferSchema { @@ -113,8 +113,12 @@ private[sql] object InferSchema { case INT | LONG => LongType // Since we do not have a data type backed by BigInteger, // when we see a Java BigInteger, we use DecimalType. - case BIG_INTEGER | BIG_DECIMAL => DecimalType.SYSTEM_DEFAULT - case FLOAT | DOUBLE => DoubleType + case BIG_INTEGER | BIG_DECIMAL => + val v = parser.getDecimalValue + DecimalType(v.precision(), v.scale()) + case FLOAT | DOUBLE => + // TODO(davies): Should we use decimal if possible? + DoubleType } case VALUE_TRUE | VALUE_FALSE => BooleanType @@ -171,9 +175,18 @@ private[sql] object InferSchema { // Double support larger range than fixed decimal, DecimalType.Maximum should be enough // in most case, also have better precision. 
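// With this change a Double/Decimal conflict always widens to DoubleType, and the new
// Decimal/Decimal case below keeps the larger scale and integer range. Worked example:
// merging DecimalType(5, 2) with DecimalType(10, 5) gives scale = max(2, 5) = 5 and
// range = max(5 - 2, 10 - 5) = 5, i.e. DecimalType(10, 5); if range + scale exceeded 38,
// the merge would fall back to DoubleType.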
case (DoubleType, t: DecimalType) => - if (t == DecimalType.SYSTEM_DEFAULT) t else DoubleType + DoubleType case (t: DecimalType, DoubleType) => - if (t == DecimalType.SYSTEM_DEFAULT) t else DoubleType + DoubleType + case (t1: DecimalType, t2: DecimalType) => + val scale = math.max(t1.scale, t2.scale) + val range = math.max(t1.precision - t1.scale, t2.precision - t2.scale) + if (range + scale > 38) { + // DecimalType can't support precision > 38 + DoubleType + } else { + DecimalType(range + scale, scale) + } case (StructType(fields1), StructType(fields2)) => val newFields = (fields1 ++ fields2).groupBy(field => field.name).map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala new file mode 100644 index 000000000000..7a49157d9e72 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.json + +import java.io.CharArrayWriter + +import com.fasterxml.jackson.core.JsonFactory +import com.google.common.base.Objects +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.io.{LongWritable, NullWritable, Text} +import org.apache.hadoop.mapred.{JobConf, TextInputFormat} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat +import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} + +import org.apache.spark.Logging +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.mapred.SparkHadoopMapRedUtil +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.PartitionSpec +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{AnalysisException, Row, SQLContext} +import org.apache.spark.util.SerializableConfiguration + + +class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { + + override def shortName(): String = "json" + + override def createRelation( + sqlContext: SQLContext, + paths: Array[String], + dataSchema: Option[StructType], + partitionColumns: Option[StructType], + parameters: Map[String, String]): HadoopFsRelation = { + val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) + + new JSONRelation(None, samplingRatio, dataSchema, None, partitionColumns, paths)(sqlContext) + } +} + +private[sql] class JSONRelation( + val inputRDD: Option[RDD[String]], + val samplingRatio: Double, + val maybeDataSchema: Option[StructType], + val maybePartitionSpec: Option[PartitionSpec], + override val userDefinedPartitionColumns: Option[StructType], + override val paths: Array[String] = Array.empty[String])(@transient val sqlContext: SQLContext) + extends HadoopFsRelation(maybePartitionSpec) { + + /** Constraints to be imposed on schema to be stored. 
*/ + private def checkConstraints(schema: StructType): Unit = { + if (schema.fieldNames.length != schema.fieldNames.distinct.length) { + val duplicateColumns = schema.fieldNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => "\"" + x + "\"" + }.mkString(", ") + throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + + s"cannot save to JSON format") + } + } + + override val needConversion: Boolean = false + + private def createBaseRdd(inputPaths: Array[FileStatus]): RDD[String] = { + val job = new Job(sqlContext.sparkContext.hadoopConfiguration) + val conf = job.getConfiguration + + val paths = inputPaths.map(_.getPath) + + if (paths.nonEmpty) { + FileInputFormat.setInputPaths(job, paths: _*) + } + + sqlContext.sparkContext.hadoopRDD( + conf.asInstanceOf[JobConf], + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text]).map(_._2.toString) // get the text line + } + + override lazy val dataSchema = { + val jsonSchema = maybeDataSchema.getOrElse { + val files = cachedLeafStatuses().filterNot { status => + val name = status.getPath.getName + name.startsWith("_") || name.startsWith(".") + }.toArray + InferSchema( + inputRDD.getOrElse(createBaseRdd(files)), + samplingRatio, + sqlContext.conf.columnNameOfCorruptRecord) + } + checkConstraints(jsonSchema) + + jsonSchema + } + + override def buildScan( + requiredColumns: Array[String], + filters: Array[Filter], + inputPaths: Array[FileStatus]): RDD[Row] = { + JacksonParser( + inputRDD.getOrElse(createBaseRdd(inputPaths)), + StructType(requiredColumns.map(dataSchema(_))), + sqlContext.conf.columnNameOfCorruptRecord).asInstanceOf[RDD[Row]] + } + + override def equals(other: Any): Boolean = other match { + case that: JSONRelation => + ((inputRDD, that.inputRDD) match { + case (Some(thizRdd), Some(thatRdd)) => thizRdd eq thatRdd + case (None, None) => true + case _ => false + }) && paths.toSet == that.paths.toSet && + dataSchema == that.dataSchema && + schema == that.schema + case _ => false + } + + override def hashCode(): Int = { + Objects.hashCode( + inputRDD, + paths.toSet, + dataSchema, + schema, + partitionColumns) + } + + override def prepareJobForWrite(job: Job): OutputWriterFactory = { + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + new JsonOutputWriter(path, dataSchema, context) + } + } + } +} + +private[json] class JsonOutputWriter( + path: String, + dataSchema: StructType, + context: TaskAttemptContext) + extends OutputWriter with SparkHadoopMapRedUtil with Logging { + + val writer = new CharArrayWriter() + // create the Generator without separator inserted between 2 records + val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + + val result = new Text() + + private val recordWriter: RecordWriter[NullWritable, Text] = { + new TextOutputFormat[NullWritable, Text]() { + override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId + new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") + } + }.getRecordWriter(context) + } + + override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") + + override 
protected[sql] def writeInternal(row: InternalRow): Unit = { + JacksonGenerator(dataSchema, gen, row) + gen.flush() + + result.set(writer.toString) + writer.reset() + + recordWriter.write(NullWritable.get(), result) + } + + override def close(): Unit = { + gen.close() + recordWriter.close(context) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala similarity index 52% rename from sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 1e6b1198d245..f65c7bbd6e29 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -15,7 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils import scala.collection.Map @@ -74,4 +77,64 @@ private[sql] object JacksonGenerator { valWriter(rowSchema, row) } + + /** Transforms a single InternalRow to JSON using Jackson + * + * TODO: make the code shared with the other apply method. + * + * @param rowSchema the schema object used for conversion + * @param gen a JsonGenerator object + * @param row The row to convert + */ + def apply(rowSchema: StructType, gen: JsonGenerator, row: InternalRow): Unit = { + def valWriter: (DataType, Any) => Unit = { + case (_, null) | (NullType, _) => gen.writeNull() + case (StringType, v) => gen.writeString(v.toString) + case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString) + case (IntegerType, v: Int) => gen.writeNumber(v) + case (ShortType, v: Short) => gen.writeNumber(v) + case (FloatType, v: Float) => gen.writeNumber(v) + case (DoubleType, v: Double) => gen.writeNumber(v) + case (LongType, v: Long) => gen.writeNumber(v) + case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) + case (ByteType, v: Byte) => gen.writeNumber(v.toInt) + case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) + case (BooleanType, v: Boolean) => gen.writeBoolean(v) + case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString) + // For UDT values, they should be in the SQL type's corresponding value type. + // We should not see values in the user-defined class at here. + // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is + // an ArrayData at here, instead of a Vector. 
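// In other words, the UDT case below simply unwraps to udt.sqlType and reuses valWriter,
// so a value whose SQL type is an array of double (the VectorUDT example above) is
// written as a plain JSON array rather than as the user-defined class.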
+ case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v) + + case (ArrayType(ty, _), v: ArrayData) => + gen.writeStartArray() + v.foreach(ty, (_, value) => valWriter(ty, value)) + gen.writeEndArray() + + case (MapType(kt, vt, _), v: MapData) => + gen.writeStartObject() + v.foreach(kt, vt, { (k, v) => + gen.writeFieldName(k.toString) + valWriter(vt, v) + }) + gen.writeEndObject() + + case (StructType(ty), v: InternalRow) => + gen.writeStartObject() + var i = 0 + while (i < ty.length) { + val field = ty(i) + val value = v.get(i, field.dataType) + if (value != null) { + gen.writeFieldName(field.name) + valWriter(field.dataType, value) + } + i += 1 + } + gen.writeEndObject() + } + + valWriter(rowSchema, row) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala similarity index 82% rename from sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index bf0448ee9645..ff4d8c04e8ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json import java.io.ByteArrayOutputStream @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.json.JacksonUtils.nextUntil +import org.apache.spark.sql.execution.datasources.json.JacksonUtils.nextUntil import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -81,12 +81,39 @@ private[sql] object JacksonParser { case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, FloatType) => parser.getFloatValue + case (VALUE_STRING, FloatType) => + // Special case handling for NaN and Infinity. + val value = parser.getText + val lowerCaseValue = value.toLowerCase() + if (lowerCaseValue.equals("nan") || + lowerCaseValue.equals("infinity") || + lowerCaseValue.equals("-infinity") || + lowerCaseValue.equals("inf") || + lowerCaseValue.equals("-inf")) { + value.toFloat + } else { + sys.error(s"Cannot parse $value as FloatType.") + } + case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DoubleType) => parser.getDoubleValue - case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DecimalType()) => - // TODO: add fixed precision and scale handling - Decimal(parser.getDecimalValue) + case (VALUE_STRING, DoubleType) => + // Special case handling for NaN and Infinity. 
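// As in the FloatType branch above, only the spellings "NaN", "Infinity", "-Infinity",
// "Inf" and "-Inf" (compared case-insensitively) are accepted; any other string for a
// double field is reported as a parse error.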
+ val value = parser.getText + val lowerCaseValue = value.toLowerCase() + if (lowerCaseValue.equals("nan") || + lowerCaseValue.equals("infinity") || + lowerCaseValue.equals("-infinity") || + lowerCaseValue.equals("inf") || + lowerCaseValue.equals("-inf")) { + value.toDouble + } else { + sys.error(s"Cannot parse $value as DoubleType.") + } + + case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, dt: DecimalType) => + Decimal(parser.getDecimalValue, dt.precision, dt.scale) case (VALUE_NUMBER_INT, ByteType) => parser.getByteValue @@ -126,7 +153,10 @@ private[sql] object JacksonParser { convertMap(factory, parser, kt) case (_, udt: UserDefinedType[_]) => - udt.deserialize(convertField(factory, parser, udt.sqlType)) + convertField(factory, parser, udt.sqlType) + + case (token, dataType) => + sys.error(s"Failed to parse a value for data type $dataType (current token: $token).") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonUtils.scala similarity index 95% rename from sql/core/src/main/scala/org/apache/spark/sql/json/JacksonUtils.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonUtils.scala index fde96852ce68..005546f37dda 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonUtils.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json import com.fasterxml.jackson.core.{JsonParser, JsonToken} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala new file mode 100644 index 000000000000..00f36caeaef4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.util.{Map => JMap} + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.hadoop.api.ReadSupport.ReadContext +import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} +import org.apache.parquet.io.api.RecordMaterializer +import org.apache.parquet.schema.Type.Repetition +import org.apache.parquet.schema._ + +import org.apache.spark.Logging +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ + +private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { + // Called after `init()` when initializing Parquet record reader. + override def prepareForRead( + conf: Configuration, + keyValueMetaData: JMap[String, String], + fileSchema: MessageType, + readContext: ReadContext): RecordMaterializer[InternalRow] = { + log.debug(s"Preparing for read Parquet file with message type: $fileSchema") + + val toCatalyst = new CatalystSchemaConverter(conf) + val parquetRequestedSchema = readContext.getRequestedSchema + + val catalystRequestedSchema = + Option(readContext.getReadSupportMetadata).map(_.asScala).flatMap { metadata => + metadata + // First tries to read requested schema, which may result from projections + .get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) + // If not available, tries to read Catalyst schema from file metadata. It's only + // available if the target file is written by Spark SQL. + .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY)) + }.map(StructType.fromString).getOrElse { + logInfo("Catalyst schema not available, falling back to Parquet schema") + toCatalyst.convert(parquetRequestedSchema) + } + + logInfo { + s"""Going to read the following fields from the Parquet file: + | + |Parquet form: + |$parquetRequestedSchema + |Catalyst form: + |$catalystRequestedSchema + """.stripMargin + } + + new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) + } + + // Called before `prepareForRead()` when initializing Parquet record reader. + override def init(context: InitContext): ReadContext = { + val conf = context.getConfiguration + + // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst + // schema of this file from its metadata. + val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA)) + + // Optional schema of requested columns, in the form of a string serialized from a Catalyst + // `StructType` containing all requested columns. 
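// When such a requested schema is present, it is deserialized with StructType.fromString
// and clipped against the file's Parquet schema via CatalystReadSupport.clipParquetSchema
// below, so only the requested column paths are read back.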
+ val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) + + val parquetRequestedSchema = + maybeRequestedSchema.fold(context.getFileSchema) { schemaString => + val catalystRequestedSchema = StructType.fromString(schemaString) + CatalystReadSupport.clipParquetSchema(context.getFileSchema, catalystRequestedSchema) + } + + val metadata = + Map.empty[String, String] ++ + maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ + maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) + + new ReadContext(parquetRequestedSchema, metadata.asJava) + } +} + +private[parquet] object CatalystReadSupport { + val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema" + + val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata" + + /** + * Tailors `parquetSchema` according to `catalystSchema` by removing column paths don't exist + * in `catalystSchema`, and adding those only exist in `catalystSchema`. + */ + def clipParquetSchema(parquetSchema: MessageType, catalystSchema: StructType): MessageType = { + val clippedParquetFields = clipParquetGroupFields(parquetSchema.asGroupType(), catalystSchema) + Types.buildMessage().addFields(clippedParquetFields: _*).named("root") + } + + private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { + catalystType match { + case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => + // Only clips array types with nested type as element type. + clipParquetListType(parquetType.asGroupType(), t.elementType) + + case t: MapType + if !isPrimitiveCatalystType(t.keyType) || + !isPrimitiveCatalystType(t.valueType) => + // Only clips map types with nested key type or value type + clipParquetMapType(parquetType.asGroupType(), t.keyType, t.valueType) + + case t: StructType => + clipParquetGroup(parquetType.asGroupType(), t) + + case _ => + // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able + // to be mapped to desired user-space types. So UDTs shouldn't participate schema merging. + parquetType + } + } + + /** + * Whether a Catalyst [[DataType]] is primitive. Primitive [[DataType]] is not equivalent to + * [[AtomicType]]. For example, [[CalendarIntervalType]] is primitive, but it's not an + * [[AtomicType]]. + */ + private def isPrimitiveCatalystType(dataType: DataType): Boolean = { + dataType match { + case _: ArrayType | _: MapType | _: StructType => false + case _ => true + } + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[ArrayType]]. The element type + * of the [[ArrayType]] should also be a nested type, namely an [[ArrayType]], a [[MapType]], or a + * [[StructType]]. + */ + private def clipParquetListType(parquetList: GroupType, elementType: DataType): Type = { + // Precondition of this method, should only be called for lists with nested element types. + assert(!isPrimitiveCatalystType(elementType)) + + // Unannotated repeated group should be interpreted as required list of required element, so + // list element type is just the group itself. Clip it. + if (parquetList.getOriginalType == null && parquetList.isRepetition(Repetition.REPEATED)) { + clipParquetType(parquetList, elementType) + } else { + assert( + parquetList.getOriginalType == OriginalType.LIST, + "Invalid Parquet schema. 
" + + "Original type of annotated Parquet lists must be LIST: " + + parquetList.toString) + + assert( + parquetList.getFieldCount == 1 && parquetList.getType(0).isRepetition(Repetition.REPEATED), + "Invalid Parquet schema. " + + "LIST-annotated group should only have exactly one repeated field: " + + parquetList) + + // Precondition of this method, should only be called for lists with nested element types. + assert(!parquetList.getType(0).isPrimitive) + + val repeatedGroup = parquetList.getType(0).asGroupType() + + // If the repeated field is a group with multiple fields, or the repeated field is a group + // with one field and is named either "array" or uses the LIST-annotated group's name with + // "_tuple" appended then the repeated type is the element type and elements are required. + // Build a new LIST-annotated group with clipped `repeatedGroup` as element type and the + // only field. + if ( + repeatedGroup.getFieldCount > 1 || + repeatedGroup.getName == "array" || + repeatedGroup.getName == parquetList.getName + "_tuple" + ) { + Types + .buildGroup(parquetList.getRepetition) + .as(OriginalType.LIST) + .addField(clipParquetType(repeatedGroup, elementType)) + .named(parquetList.getName) + } else { + // Otherwise, the repeated field's type is the element type with the repeated field's + // repetition. + Types + .buildGroup(parquetList.getRepetition) + .as(OriginalType.LIST) + .addField( + Types + .repeatedGroup() + .addField(clipParquetType(repeatedGroup.getType(0), elementType)) + .named(repeatedGroup.getName)) + .named(parquetList.getName) + } + } + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[MapType]]. Either key type or + * value type of the [[MapType]] must be a nested type, namely an [[ArrayType]], a [[MapType]], or + * a [[StructType]]. + */ + private def clipParquetMapType( + parquetMap: GroupType, keyType: DataType, valueType: DataType): GroupType = { + // Precondition of this method, only handles maps with nested key types or value types. + assert(!isPrimitiveCatalystType(keyType) || !isPrimitiveCatalystType(valueType)) + + val repeatedGroup = parquetMap.getType(0).asGroupType() + val parquetKeyType = repeatedGroup.getType(0) + val parquetValueType = repeatedGroup.getType(1) + + val clippedRepeatedGroup = + Types + .repeatedGroup() + .as(repeatedGroup.getOriginalType) + .addField(clipParquetType(parquetKeyType, keyType)) + .addField(clipParquetType(parquetValueType, valueType)) + .named(repeatedGroup.getName) + + Types + .buildGroup(parquetMap.getRepetition) + .as(parquetMap.getOriginalType) + .addField(clippedRepeatedGroup) + .named(parquetMap.getName) + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[StructType]]. + * + * @return A clipped [[GroupType]], which has at least one field. + * @note Parquet doesn't allow creating empty [[GroupType]] instances except for empty + * [[MessageType]]. Because it's legal to construct an empty requested schema for column + * pruning. + */ + private def clipParquetGroup(parquetRecord: GroupType, structType: StructType): GroupType = { + val clippedParquetFields = clipParquetGroupFields(parquetRecord, structType) + Types + .buildGroup(parquetRecord.getRepetition) + .as(parquetRecord.getOriginalType) + .addFields(clippedParquetFields: _*) + .named(parquetRecord.getName) + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[StructType]]. + * + * @return A list of clipped [[GroupType]] fields, which can be empty. 
+ */ + private def clipParquetGroupFields( + parquetRecord: GroupType, structType: StructType): Seq[Type] = { + val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap + val toParquet = new CatalystSchemaConverter(followParquetFormatSpec = true) + structType.map { f => + parquetFieldMap + .get(f.name) + .map(clipParquetType(_, f.dataType)) + .getOrElse(toParquet.convertField(f)) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRecordMaterializer.scala similarity index 96% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRecordMaterializer.scala index 84f1dccfeb78..ed9e0aa65977 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRecordMaterializer.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer} import org.apache.parquet.schema.MessageType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala similarity index 66% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index 6938b071065c..2ff2fda3610b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -15,20 +15,22 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal, BigInteger} import java.nio.ByteOrder -import scala.collection.JavaConversions._ -import scala.collection.mutable +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} +import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE import org.apache.parquet.schema.Type.Repetition -import org.apache.parquet.schema.{GroupType, PrimitiveType, Type} +import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type} +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -42,6 +44,12 @@ import org.apache.spark.unsafe.types.UTF8String * values to an [[ArrayBuffer]]. 
*/ private[parquet] trait ParentContainerUpdater { + /** Called before a record field is converted */ + def start(): Unit = () + + /** Called after a record field has been converted */ + def end(): Unit = () + + def set(value: Any): Unit = () + def setBoolean(value: Boolean): Unit = set(value) + def setByte(value: Byte): Unit = set(value) @@ -55,9 +63,52 @@ private[parquet] trait ParentContainerUpdater { /** A no-op updater used for root converter (which doesn't have a parent). */ private[parquet] object NoopUpdater extends ParentContainerUpdater +private[parquet] trait HasParentContainerUpdater { + def updater: ParentContainerUpdater +} + +/** + * A convenient converter class for Parquet group types with a [[HasParentContainerUpdater]]. + */ +private[parquet] abstract class CatalystGroupConverter(val updater: ParentContainerUpdater) + extends GroupConverter with HasParentContainerUpdater + +/** + * Parquet converter for Parquet primitive types. Note that not all Spark SQL atomic types + * are handled by this converter. Parquet primitive types are only a subset of those of Spark + * SQL. For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet. + */ +private[parquet] class CatalystPrimitiveConverter(val updater: ParentContainerUpdater) + extends PrimitiveConverter with HasParentContainerUpdater { + + override def addBoolean(value: Boolean): Unit = updater.setBoolean(value) + override def addInt(value: Int): Unit = updater.setInt(value) + override def addLong(value: Long): Unit = updater.setLong(value) + override def addFloat(value: Float): Unit = updater.setFloat(value) + override def addDouble(value: Double): Unit = updater.setDouble(value) + override def addBinary(value: Binary): Unit = updater.set(value.getBytes) +} + /** - * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[InternalRow]]s. - * Since any Parquet record is also a struct, this converter can also be used as root converter. + * A [[CatalystRowConverter]] is used to convert Parquet records into Catalyst [[InternalRow]]s. + * Since a Catalyst `StructType` also maps to a Parquet record, this converter can be used as the root + * converter. Take the following Parquet type as an example: + * {{{ + * message root { + * required int32 f1; + * optional group f2 { + * required double f21; + * optional binary f22 (utf8); + * } + * } + * }}} + * 5 converters will be created: + * + * - a root [[CatalystRowConverter]] for [[MessageType]] `root`, which contains: + * - a [[CatalystPrimitiveConverter]] for required [[INT_32]] field `f1`, and + * - a nested [[CatalystRowConverter]] for optional [[GroupType]] `f2`, which contains: + * - a [[CatalystPrimitiveConverter]] for required [[DOUBLE]] field `f21`, and + * - a [[CatalystStringConverter]] for optional [[UTF8]] string field `f22` * * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have * any "parent" container.
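The converter tree described in the Scaladoc above can be exercised roughly as follows. This is an editorial sketch, not part of the patch: the schema literals are the ones from the example, and calling the package-private CatalystRowConverter and NoopUpdater directly is an assumption that only holds inside the org.apache.spark.sql.execution.datasources.parquet package.

import org.apache.parquet.schema.MessageTypeParser
import org.apache.spark.sql.types._

// Parquet schema from the Scaladoc example above.
val parquetSchema = MessageTypeParser.parseMessageType(
  """message root {
    |  required int32 f1;
    |  optional group f2 {
    |    required double f21;
    |    optional binary f22 (UTF8);
    |  }
    |}
  """.stripMargin)

// The corresponding Catalyst schema (field counts must match, see the assert below).
val catalystSchema = StructType(Seq(
  StructField("f1", IntegerType, nullable = false),
  StructField("f2", StructType(Seq(
    StructField("f21", DoubleType, nullable = false),
    StructField("f22", StringType))), nullable = true)))

// Used as the root converter; NoopUpdater is passed because a root converter has no parent.
val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater)
// rootConverter.getConverter(0) is the CatalystPrimitiveConverter for `f1`, and
// rootConverter.getConverter(1) is the nested CatalystRowConverter for `f2`.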
@@ -70,7 +121,26 @@ private[parquet] class CatalystRowConverter( parquetType: GroupType, catalystType: StructType, updater: ParentContainerUpdater) - extends GroupConverter { + extends CatalystGroupConverter(updater) with Logging { + + assert( + parquetType.getFieldCount == catalystType.length, + s"""Field counts of the Parquet schema and the Catalyst schema don't match: + | + |Parquet schema: + |$parquetType + |Catalyst schema: + |${catalystType.prettyJson} + """.stripMargin) + + logDebug( + s"""Building row converter for the following schema: + | + |Parquet form: + |$parquetType + |Catalyst form: + |${catalystType.prettyJson} + """.stripMargin) /** * Updater used together with field converters within a [[CatalystRowConverter]]. It propagates @@ -89,14 +159,12 @@ private[parquet] class CatalystRowConverter( /** * Represents the converted row object once an entire Parquet record is converted. - * - * @todo Uses [[UnsafeRow]] for better performance. */ val currentRow = new SpecificMutableRow(catalystType.map(_.dataType)) // Converters for each field. - private val fieldConverters: Array[Converter] = { - parquetType.getFields.zip(catalystType).zipWithIndex.map { + private val fieldConverters: Array[Converter with HasParentContainerUpdater] = { + parquetType.getFields.asScala.zip(catalystType).zipWithIndex.map { case ((parquetFieldType, catalystField), ordinal) => // Converted field value should be set to the `ordinal`-th cell of `currentRow` newConverter(parquetFieldType, catalystField.dataType, new RowUpdater(currentRow, ordinal)) @@ -105,11 +173,19 @@ private[parquet] class CatalystRowConverter( override def getConverter(fieldIndex: Int): Converter = fieldConverters(fieldIndex) - override def end(): Unit = updater.set(currentRow) + override def end(): Unit = { + var i = 0 + while (i < currentRow.numFields) { + fieldConverters(i).updater.end() + i += 1 + } + updater.set(currentRow) + } override def start(): Unit = { var i = 0 while (i < currentRow.numFields) { + fieldConverters(i).updater.start() currentRow.setNullAt(i) i += 1 } @@ -122,20 +198,20 @@ private[parquet] class CatalystRowConverter( private def newConverter( parquetType: Type, catalystType: DataType, - updater: ParentContainerUpdater): Converter = { + updater: ParentContainerUpdater): Converter with HasParentContainerUpdater = { catalystType match { case BooleanType | IntegerType | LongType | FloatType | DoubleType | BinaryType => new CatalystPrimitiveConverter(updater) case ByteType => - new PrimitiveConverter { + new CatalystPrimitiveConverter(updater) { override def addInt(value: Int): Unit = updater.setByte(value.asInstanceOf[ByteType#InternalType]) } case ShortType => - new PrimitiveConverter { + new CatalystPrimitiveConverter(updater) { override def addInt(value: Int): Unit = updater.setShort(value.asInstanceOf[ShortType#InternalType]) } @@ -148,7 +224,7 @@ private[parquet] class CatalystRowConverter( case TimestampType => // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. - new PrimitiveConverter { + new CatalystPrimitiveConverter(updater) { // Converts nanosecond timestamps stored as INT96 override def addBinary(value: Binary): Unit = { assert( @@ -164,13 +240,23 @@ private[parquet] class CatalystRowConverter( } case DateType => - new PrimitiveConverter { + new CatalystPrimitiveConverter(updater) { override def addInt(value: Int): Unit = { // DateType is not specialized in `SpecificMutableRow`, have to box it here. 
updater.set(value.asInstanceOf[DateType#InternalType]) } } + // A repeated field that is neither contained by a `LIST`- or `MAP`-annotated group nor + // annotated by `LIST` or `MAP` should be interpreted as a required list of required + // elements where the element type is the type of the field. + case t: ArrayType if parquetType.getOriginalType != LIST => + if (parquetType.isPrimitive) { + new RepeatedPrimitiveConverter(parquetType, t.elementType, updater) + } else { + new RepeatedGroupConverter(parquetType, t.elementType, updater) + } + case t: ArrayType => new CatalystArrayConverter(parquetType.asGroupType(), t, updater) @@ -195,27 +281,11 @@ private[parquet] class CatalystRowConverter( } } - /** - * Parquet converter for Parquet primitive types. Note that not all Spark SQL atomic types - * are handled by this converter. Parquet primitive types are only a subset of those of Spark - * SQL. For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet. - */ - private final class CatalystPrimitiveConverter(updater: ParentContainerUpdater) - extends PrimitiveConverter { - - override def addBoolean(value: Boolean): Unit = updater.setBoolean(value) - override def addInt(value: Int): Unit = updater.setInt(value) - override def addLong(value: Long): Unit = updater.setLong(value) - override def addFloat(value: Float): Unit = updater.setFloat(value) - override def addDouble(value: Double): Unit = updater.setDouble(value) - override def addBinary(value: Binary): Unit = updater.set(value.getBytes) - } - /** * Parquet converter for strings. A dictionary is used to minimize string decoding cost. */ private final class CatalystStringConverter(updater: ParentContainerUpdater) - extends PrimitiveConverter { + extends CatalystPrimitiveConverter(updater) { private var expandedDictionary: Array[UTF8String] = null @@ -242,7 +312,7 @@ private[parquet] class CatalystRowConverter( private final class CatalystDecimalConverter( decimalType: DecimalType, updater: ParentContainerUpdater) - extends PrimitiveConverter { + extends CatalystPrimitiveConverter(updater) { // Converts decimals stored as INT32 override def addInt(value: Int): Unit = { @@ -264,7 +334,7 @@ private[parquet] class CatalystRowConverter( val scale = decimalType.scale val bytes = value.getBytes - if (precision <= 8) { + if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { // Constructs a `Decimal` with an unscaled `Long` value if possible. 
var unscaled = 0L var i = 0 @@ -306,15 +376,16 @@ private[parquet] class CatalystRowConverter( parquetSchema: GroupType, catalystSchema: ArrayType, updater: ParentContainerUpdater) - extends GroupConverter { + extends CatalystGroupConverter(updater) { private var currentArray: ArrayBuffer[Any] = _ private val elementConverter: Converter = { val repeatedType = parquetSchema.getType(0) val elementType = catalystSchema.elementType + val parentName = parquetSchema.getName - if (isElementType(repeatedType, elementType)) { + if (isElementType(repeatedType, elementType, parentName)) { newConverter(repeatedType, elementType, new ParentContainerUpdater { override def set(value: Any): Unit = currentArray += value }) @@ -351,10 +422,13 @@ private[parquet] class CatalystRowConverter( * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules */ // scalastyle:on - private def isElementType(parquetRepeatedType: Type, catalystElementType: DataType): Boolean = { + private def isElementType( + parquetRepeatedType: Type, catalystElementType: DataType, parentName: String): Boolean = { (parquetRepeatedType, catalystElementType) match { case (t: PrimitiveType, _) => true case (t: GroupType, _) if t.getFieldCount > 1 => true + case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == "array" => true + case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == parentName + "_tuple" => true case (t: GroupType, StructType(Array(f))) if f.name == t.getFieldName(0) => true case _ => false } @@ -383,7 +457,7 @@ private[parquet] class CatalystRowConverter( parquetType: GroupType, catalystType: MapType, updater: ParentContainerUpdater) - extends GroupConverter { + extends CatalystGroupConverter(updater) { private var currentKeys: ArrayBuffer[Any] = _ private var currentValues: ArrayBuffer[Any] = _ @@ -446,4 +520,61 @@ private[parquet] class CatalystRowConverter( } } } + + private trait RepeatedConverter { + private var currentArray: ArrayBuffer[Any] = _ + + protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { + override def start(): Unit = currentArray = ArrayBuffer.empty[Any] + override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) + override def set(value: Any): Unit = currentArray += value + } + } + + /** + * A primitive converter for converting unannotated repeated primitive values to required arrays + * of required primitives values. 
+ */ + private final class RepeatedPrimitiveConverter( + parquetType: Type, + catalystType: DataType, + parentUpdater: ParentContainerUpdater) + extends PrimitiveConverter with RepeatedConverter with HasParentContainerUpdater { + + val updater: ParentContainerUpdater = newArrayUpdater(parentUpdater) + + private val elementConverter: PrimitiveConverter = + newConverter(parquetType, catalystType, updater).asPrimitiveConverter() + + override def addBoolean(value: Boolean): Unit = elementConverter.addBoolean(value) + override def addInt(value: Int): Unit = elementConverter.addInt(value) + override def addLong(value: Long): Unit = elementConverter.addLong(value) + override def addFloat(value: Float): Unit = elementConverter.addFloat(value) + override def addDouble(value: Double): Unit = elementConverter.addDouble(value) + override def addBinary(value: Binary): Unit = elementConverter.addBinary(value) + + override def setDictionary(dict: Dictionary): Unit = elementConverter.setDictionary(dict) + override def hasDictionarySupport: Boolean = elementConverter.hasDictionarySupport + override def addValueFromDictionary(id: Int): Unit = elementConverter.addValueFromDictionary(id) + } + + /** + * A group converter for converting unannotated repeated group values to required arrays of + * required struct values. + */ + private final class RepeatedGroupConverter( + parquetType: Type, + catalystType: DataType, + parentUpdater: ParentContainerUpdater) + extends GroupConverter with HasParentContainerUpdater with RepeatedConverter { + + val updater: ParentContainerUpdater = newArrayUpdater(parentUpdater) + + private val elementConverter: GroupConverter = + newConverter(parquetType, catalystType, updater).asGroupConverter() + + override def getConverter(field: Int): Converter = elementConverter.getConverter(field) + override def end(): Unit = elementConverter.end() + override def start(): Unit = elementConverter.start() + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala similarity index 90% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala index d43ca95b4eea..9c539130d422 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConversions._ @@ -25,6 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition._ import org.apache.parquet.schema._ +import org.apache.spark.sql.execution.datasources.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes} import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLConf} @@ -54,16 +55,10 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * to old style non-standard behaviors. 
*/ private[parquet] class CatalystSchemaConverter( - private val assumeBinaryIsString: Boolean, - private val assumeInt96IsTimestamp: Boolean, - private val followParquetFormatSpec: Boolean) { - - // Only used when constructing converter for converting Spark SQL schema to Parquet schema, in - // which case `assumeInt96IsTimestamp` and `assumeBinaryIsString` are irrelevant. - def this() = this( - assumeBinaryIsString = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, - assumeInt96IsTimestamp = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, - followParquetFormatSpec = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, + assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, + followParquetFormatSpec: Boolean = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get +) { def this(conf: SQLConf) = this( assumeBinaryIsString = conf.isParquetBinaryAsString, @@ -71,18 +66,9 @@ private[parquet] class CatalystSchemaConverter( followParquetFormatSpec = conf.followParquetFormatSpec) def this(conf: Configuration) = this( - assumeBinaryIsString = - conf.getBoolean( - SQLConf.PARQUET_BINARY_AS_STRING.key, - SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get), - assumeInt96IsTimestamp = - conf.getBoolean( - SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, - SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get), - followParquetFormatSpec = - conf.getBoolean( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get)) + assumeBinaryIsString = conf.get(SQLConf.PARQUET_BINARY_AS_STRING.key).toBoolean, + assumeInt96IsTimestamp = conf.get(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key).toBoolean, + followParquetFormatSpec = conf.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key).toBoolean) /** * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]]. @@ -99,8 +85,11 @@ private[parquet] class CatalystSchemaConverter( StructField(field.getName, convertField(field), nullable = false) case REPEATED => - throw new AnalysisException( - s"REPEATED not supported outside LIST or MAP. Type: $field") + // A repeated field that is neither contained by a `LIST`- or `MAP`-annotated group nor + // annotated by `LIST` or `MAP` should be interpreted as a required list of required + // elements where the element type is the type of the field. 
+ val arrayType = ArrayType(convertField(field), containsNull = false) + StructField(field.getName, arrayType, nullable = false) } } @@ -155,7 +144,7 @@ private[parquet] class CatalystSchemaConverter( case INT_16 => ShortType case INT_32 | null => IntegerType case DATE => DateType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(4)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32) case TIME_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -163,7 +152,7 @@ private[parquet] class CatalystSchemaConverter( case INT64 => originalType match { case INT_64 | null => LongType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(8)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64) case TIMESTAMP_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -405,7 +394,7 @@ private[parquet] class CatalystSchemaConverter( // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -415,7 +404,7 @@ private[parquet] class CatalystSchemaConverter( // Uses INT64 for 1 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -437,13 +426,14 @@ private[parquet] class CatalystSchemaConverter( // ArrayType and MapType (for Spark versions <= 1.4.x) // =================================================== - // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level - // LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is - // covered by the backwards-compatibility rules implemented in `isElementType()`. + // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level + // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro + // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element + // field name "array" is borrowed from parquet-avro. case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => // group (LIST) { // optional group bag { - // repeated element; + // repeated array; // } // } ConversionPatterns.listType( @@ -452,8 +442,8 @@ private[parquet] class CatalystSchemaConverter( Types .buildGroup(REPEATED) // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) - .addField(convertField(StructField("array_element", elementType, nullable))) - .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) + .addField(convertField(StructField("array", elementType, nullable))) + .named("bag")) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). 
Note that this case is @@ -534,14 +524,6 @@ private[parquet] class CatalystSchemaConverter( throw new AnalysisException(s"Unsupported data type $field.dataType") } } - - // Max precision of a decimal value stored in `numBytes` bytes - private def maxPrecisionForBytes(numBytes: Int): Int = { - Math.round( // convert double to long - Math.floor(Math.log10( // number of base-10 digits - Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes - .asInstanceOf[Int] - } } @@ -584,4 +566,16 @@ private[parquet] object CatalystSchemaConverter { computeMinBytesForPrecision(precision) } } + + val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) + + val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8) + + // Max precision of a decimal value stored in `numBytes` bytes + def maxPrecisionForBytes(numBytes: Int): Int = { + Math.round( // convert double to long + Math.floor(Math.log10( // number of base-10 digits + Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes + .asInstanceOf[Int] + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala index 1551afd7b7bf..2c6b914328b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala similarity index 96% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala index 6ed3580af072..ccd7ebf319af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.{MapData, ArrayData} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala similarity index 64% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index d57b789f5c1c..c6b3fe7900da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -15,35 +15,27 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.io.Serializable import java.nio.ByteBuffer import com.google.common.io.BaseEncoding import org.apache.hadoop.conf.Configuration -import org.apache.parquet.filter2.compat.FilterCompat -import org.apache.parquet.filter2.compat.FilterCompat._ import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Statistics} -import org.apache.parquet.filter2.predicate.UserDefinedPredicate +import org.apache.parquet.filter2.predicate._ import org.apache.parquet.io.api.Binary +import org.apache.parquet.schema.OriginalType +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName import org.apache.spark.SparkEnv import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.sources import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String private[sql] object ParquetFilters { val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter" - def createRecordFilter(filterExpressions: Seq[Expression]): Option[Filter] = { - filterExpressions.flatMap { filter => - createFilter(filter) - }.reduceOption(FilterApi.and).map(FilterCompat.get) - } - case class SetInFilter[T <: Comparable[T]]( valueSet: Set[T]) extends UserDefinedPredicate[T] with Serializable { @@ -72,7 +64,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), - Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[UTF8String].getBytes)).orNull) + Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))).orNull) case BinaryType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), @@ -93,7 +85,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), - Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[UTF8String].getBytes)).orNull) + Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))).orNull) case BinaryType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), @@ -111,7 +103,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.lt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.lt(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.lt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -128,7 +121,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.ltEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.ltEq(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.ltEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -145,7 +139,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.gt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.gt(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType 
=> (n: String, v: Any) => FilterApi.gt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -162,7 +157,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.gtEq(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -184,7 +180,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Set[Any]) => FilterApi.userDefined(binaryColumn(n), - SetInFilter(v.map(e => Binary.fromByteArray(e.asInstanceOf[UTF8String].getBytes)))) + SetInFilter(v.map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))))) case BinaryType => (n: String, v: Set[Any]) => FilterApi.userDefined(binaryColumn(n), @@ -197,11 +193,23 @@ private[sql] object ParquetFilters { def createFilter(schema: StructType, predicate: sources.Filter): Option[FilterPredicate] = { val dataTypeOf = schema.map(f => f.name -> f.dataType).toMap + relaxParquetValidTypeMap + // NOTE: // // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`, // which can be casted to `false` implicitly. Please refer to the `eval` method of these // operators and the `SimplifyFilters` rule for details. + + // Hyukjin: + // I added [[EqualNullSafe]] with [[org.apache.parquet.filter2.predicate.Operators.Eq]]. + // So, it performs equality comparison identically when given [[sources.Filter]] is [[EqualTo]]. + // The reason why I did this is, that the actual Parquet filter checks null-safe equality + // comparison. + // So I added this and maybe [[EqualTo]] should be changed. It still seems fine though, because + // physical planning does not set `NULL` to [[EqualTo]] but changes it to [[IsNull]] and etc. + // Probably I missed something and obviously this should be changed. + predicate match { case sources.IsNull(name) => makeEq.lift(dataTypeOf(name)).map(_(name, null)) @@ -213,6 +221,11 @@ private[sql] object ParquetFilters { case sources.Not(sources.EqualTo(name, value)) => makeNotEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.EqualNullSafe(name, value) => + makeEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.Not(sources.EqualNullSafe(name, value)) => + makeNotEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.LessThan(name, value) => makeLt.lift(dataTypeOf(name)).map(_(name, value)) case sources.LessThanOrEqual(name, value) => @@ -239,94 +252,35 @@ private[sql] object ParquetFilters { } } - /** - * Converts Catalyst predicate expressions to Parquet filter predicates. - * - * @todo This can be removed once we get rid of the old Parquet support. - */ - def createFilter(predicate: Expression): Option[FilterPredicate] = { - // NOTE: - // - // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`, - // which can be casted to `false` implicitly. Please refer to the `eval` method of these - // operators and the `SimplifyFilters` rule for details. 
- predicate match { - case IsNull(NamedExpression(name, dataType)) => - makeEq.lift(dataType).map(_(name, null)) - case IsNotNull(NamedExpression(name, dataType)) => - makeNotEq.lift(dataType).map(_(name, null)) - - case EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeEq.lift(dataType).map(_(name, value)) - - case Not(EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType))) => - makeNotEq.lift(dataType).map(_(name, value)) - - case LessThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeLt.lift(dataType).map(_(name, value)) - case LessThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeLt.lift(dataType).map(_(name, value)) - case LessThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeGt.lift(dataType).map(_(name, value)) - case LessThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeGt.lift(dataType).map(_(name, value)) - - case LessThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeLtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeLtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeGtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeGtEq.lift(dataType).map(_(name, value)) - - case GreaterThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeGt.lift(dataType).map(_(name, value)) - case GreaterThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeGt.lift(dataType).map(_(name, value)) - case GreaterThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeLt.lift(dataType).map(_(name, value)) - case GreaterThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeLt.lift(dataType).map(_(name, value)) - - case GreaterThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeGtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeGtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeLtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeLtEq.lift(dataType).map(_(name, value)) - - case And(lhs, rhs) => - (createFilter(lhs) ++ createFilter(rhs)).reduceOption(FilterApi.and) - - case Or(lhs, rhs) => - for { - lhsFilter <- createFilter(lhs) - rhsFilter 
<- createFilter(rhs) - } yield FilterApi.or(lhsFilter, rhsFilter) - - case Not(pred) => - createFilter(pred).map(FilterApi.not) - - case InSet(NamedExpression(name, dataType), valueSet) => - makeInSet.lift(dataType).map(_(name, valueSet)) - - case _ => None - } + // !! HACK ALERT !! + // + // This lazy val is a workaround for PARQUET-201, and should be removed once we upgrade to + // parquet-mr 1.8.1 or higher versions. + // + // In Parquet, not all types of columns can be used for filter push-down optimization. The set + // of valid column types is controlled by `ValidTypeMap`. Unfortunately, in parquet-mr 1.7.0 and + // prior versions, the limitation is too strict, and doesn't allow `BINARY (ENUM)` columns to be + // pushed down. + // + // This restriction is problematic for Spark SQL, because Spark SQL doesn't have a type that maps + // to Parquet original type `ENUM` directly, and always converts `ENUM` to `StringType`. Thus, + // a predicate involving a `ENUM` field can be pushed-down as a string column, which is perfectly + // legal except that it fails the `ValidTypeMap` check. + // + // Here we add `BINARY (ENUM)` into `ValidTypeMap` lazily via reflection to workaround this issue. + private lazy val relaxParquetValidTypeMap: Unit = { + val constructor = Class + .forName(classOf[ValidTypeMap].getCanonicalName + "$FullTypeDescriptor") + .getDeclaredConstructor(classOf[PrimitiveTypeName], classOf[OriginalType]) + + constructor.setAccessible(true) + val enumTypeDescriptor = constructor + .newInstance(PrimitiveTypeName.BINARY, OriginalType.ENUM) + .asInstanceOf[AnyRef] + + val addMethod = classOf[ValidTypeMap].getDeclaredMethods.find(_.getName == "add").get + addMethod.setAccessible(true) + addMethod.invoke(null, classOf[Binary], enumTypeDescriptor) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala similarity index 86% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index b4337a48dbd8..a73dccd7de22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -15,10 +15,10 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.net.URI -import java.util.logging.{Level, Logger => JLogger} +import java.util.logging.{Logger => JLogger} import java.util.{List => JList} import scala.collection.JavaConversions._ @@ -26,30 +26,35 @@ import scala.collection.mutable import scala.util.{Failure, Try} import com.google.common.base.Objects +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.hadoop._ import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetRecordReader, _} import org.apache.parquet.schema.MessageType -import org.apache.parquet.{Log => ParquetLog} +import org.apache.parquet.{Log => ApacheParquetLog} +import org.slf4j.bridge.SLF4JBridgeHandler -import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD} -import org.apache.spark.rdd.RDD._ +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.rdd.{RDD, SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionSpec import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} + +private[sql] class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { + + override def shortName(): String = "parquet" -private[sql] class DefaultSource extends HadoopFsRelationProvider { override def createRelation( sqlContext: SQLContext, paths: Array[String], @@ -62,7 +67,7 @@ private[sql] class DefaultSource extends HadoopFsRelationProvider { // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext) - extends OutputWriterInternal { + extends OutputWriter { private val recordWriter: RecordWriter[Void, InternalRow] = { val outputFormat = { @@ -77,8 +82,10 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all // partitions in the case of dynamic partitioning. 
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") } } @@ -87,7 +94,9 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext outputFormat.getRecordWriter(context) } - override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) + override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") + + override protected[sql] def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) override def close(): Unit = recordWriter.close(context) } @@ -204,6 +213,13 @@ private[sql] class ParquetRelation( override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) + // SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible + val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) + if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, + classOf[DirectParquetOutputCommitter].getCanonicalName) + } + val committerClass = conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, @@ -269,12 +285,18 @@ private[sql] class ParquetRelation( val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec + // Parquet row group size. We will use this value as the value for + // mapreduce.input.fileinputformat.split.minsize and mapred.min.split.size if the value + // of these flags are smaller than the parquet row group size. + val parquetBlockSize = ParquetOutputFormat.getLongBlockSize(broadcastedConf.value.value) + // Create the function to set variable Parquet confs at both driver and executor side. val initLocalJobFuncOpt = ParquetRelation.initializeLocalJobFunc( requiredColumns, filters, dataSchema, + parquetBlockSize, useMetadataCache, parquetFilterPushDown, assumeBinaryIsString, @@ -282,7 +304,8 @@ private[sql] class ParquetRelation( followParquetFormatSpec) _ // Create the function to set input paths at the driver side. 
- val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles) _ + val setInputPaths = + ParquetRelation.initializeDriverSideJobFunc(inputFiles, parquetBlockSize) _ Utils.withDummyCallSite(sqlContext.sparkContext) { new SqlNewHadoopRDD( @@ -291,7 +314,6 @@ private[sql] class ParquetRelation( initDriverSideJobFuncOpt = Some(setInputPaths), initLocalJobFuncOpt = Some(initLocalJobFuncOpt), inputFormatClass = classOf[ParquetInputFormat[InternalRow]], - keyClass = classOf[Void], valueClass = classOf[InternalRow]) { val cacheMetadata = useMetadataCache @@ -328,7 +350,7 @@ private[sql] class ParquetRelation( new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } } - }.values.asInstanceOf[RDD[Row]] // type erasure hack to pass RDD[InternalRow] as RDD[Row] + }.asInstanceOf[RDD[Row]] // type erasure hack to pass RDD[InternalRow] as RDD[Row] } } @@ -471,11 +493,35 @@ private[sql] object ParquetRelation extends Logging { // internally. private[sql] val METASTORE_SCHEMA = "metastoreSchema" + /** + * If parquet's block size (row group size) setting is larger than the min split size, + * we use parquet's block size setting as the min split size. Otherwise, we will create + * tasks processing nothing (because a split does not cover the starting point of a + * parquet block). See https://issues.apache.org/jira/browse/SPARK-10143 for more information. + */ + private def overrideMinSplitSize(parquetBlockSize: Long, conf: Configuration): Unit = { + val minSplitSize = + math.max( + conf.getLong("mapred.min.split.size", 0L), + conf.getLong("mapreduce.input.fileinputformat.split.minsize", 0L)) + if (parquetBlockSize > minSplitSize) { + val message = + s"Parquet's block size (row group size) is larger than " + + s"mapred.min.split.size/mapreduce.input.fileinputformat.split.minsize. Setting " + + s"mapred.min.split.size and mapreduce.input.fileinputformat.split.minsize to " + + s"$parquetBlockSize." + logDebug(message) + conf.set("mapred.min.split.size", parquetBlockSize.toString) + conf.set("mapreduce.input.fileinputformat.split.minsize", parquetBlockSize.toString) + } + } + /** This closure sets various Parquet configurations at both driver side and executor side. */ private[parquet] def initializeLocalJobFunc( requiredColumns: Array[String], filters: Array[Filter], dataSchema: StructType, + parquetBlockSize: Long, useMetadataCache: Boolean, parquetFilterPushDown: Boolean, assumeBinaryIsString: Boolean, @@ -511,16 +557,21 @@ private[sql] object ParquetRelation extends Logging { conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString) conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp) conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec) + + overrideMinSplitSize(parquetBlockSize, conf) } /** This closure sets input paths at the driver side. */ private[parquet] def initializeDriverSideJobFunc( - inputFiles: Array[FileStatus])(job: Job): Unit = { + inputFiles: Array[FileStatus], + parquetBlockSize: Long)(job: Job): Unit = { // We side the input paths at the driver side. 
logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}") if (inputFiles.nonEmpty) { FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*) } + + overrideMinSplitSize(parquetBlockSize, job.getConfiguration) } private[parquet] def readSchema( @@ -667,7 +718,7 @@ private[sql] object ParquetRelation extends Logging { val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration) - // HACK ALERT: + // !! HACK ALERT !! // // Parquet requires `FileStatus`es to read footers. Here we try to send cached `FileStatus`es // to executor side to avoid fetching them again. However, `FileStatus` is not `Serializable` @@ -748,38 +799,39 @@ private[sql] object ParquetRelation extends Logging { }.toOption } - def enableLogForwarding() { - // Note: the org.apache.parquet.Log class has a static initializer that - // sets the java.util.logging Logger for "org.apache.parquet". This - // checks first to see if there's any handlers already set - // and if not it creates them. If this method executes prior - // to that class being loaded then: - // 1) there's no handlers installed so there's none to - // remove. But when it IS finally loaded the desired affect - // of removing them is circumvented. - // 2) The parquet.Log static initializer calls setUseParentHandlers(false) - // undoing the attempt to override the logging here. - // - // Therefore we need to force the class to be loaded. - // This should really be resolved by Parquet. - Utils.classForName(classOf[ParquetLog].getName) - - // Note: Logger.getLogger("parquet") has a default logger - // that appends to Console which needs to be cleared. - val parquetLogger = JLogger.getLogger(classOf[ParquetLog].getPackage.getName) - parquetLogger.getHandlers.foreach(parquetLogger.removeHandler) - parquetLogger.setUseParentHandlers(true) - - // Disables a WARN log message in ParquetOutputCommitter. We first ensure that - // ParquetOutputCommitter is loaded and the static LOG field gets initialized. - // See https://issues.apache.org/jira/browse/SPARK-5968 for details - Utils.classForName(classOf[ParquetOutputCommitter].getName) - JLogger.getLogger(classOf[ParquetOutputCommitter].getName).setLevel(Level.OFF) - - // Similar as above, disables a unnecessary WARN log message in ParquetRecordReader. - // See https://issues.apache.org/jira/browse/PARQUET-220 for details - Utils.classForName(classOf[ParquetRecordReader[_]].getName) - JLogger.getLogger(classOf[ParquetRecordReader[_]].getName).setLevel(Level.OFF) + // JUL loggers must be held by a strong reference, otherwise they may get destroyed by GC. + // However, the root JUL logger used by Parquet isn't properly referenced. Here we keep + // references to loggers in both parquet-mr <= 1.6 and >= 1.7 + val apacheParquetLogger: JLogger = JLogger.getLogger(classOf[ApacheParquetLog].getPackage.getName) + val parquetLogger: JLogger = JLogger.getLogger("parquet") + + // Parquet initializes its own JUL logger in a static block which always prints to stdout. Here + // we redirect the JUL logger via SLF4J JUL bridge handler. + val redirectParquetLogsViaSLF4J: Unit = { + def redirect(logger: JLogger): Unit = { + logger.getHandlers.foreach(logger.removeHandler) + logger.setUseParentHandlers(false) + logger.addHandler(new SLF4JBridgeHandler) + } + + // For parquet-mr 1.7.0 and above versions, which are under `org.apache.parquet` namespace. 
+ // scalastyle:off classforname + Class.forName(classOf[ApacheParquetLog].getName) + // scalastyle:on classforname + redirect(JLogger.getLogger(classOf[ApacheParquetLog].getPackage.getName)) + + // For parquet-mr 1.6.0 and lower versions bundled with Hive, which are under `parquet` + // namespace. + try { + // scalastyle:off classforname + Class.forName("parquet.Log") + // scalastyle:on classforname + redirect(JLogger.getLogger("parquet")) + } catch { case _: Throwable => + // SPARK-9974: com.twitter:parquet-hadoop-bundle:1.6.0 is not packaged into the assembly jar + // when Spark is built with SBT. So `parquet.Log` may not be found. This try/catch block + // should be removed after this issue is fixed. + } } // The parquet compression short names diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala index 9cd0250f9c51..ed89aa27aa1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.math.BigInteger import java.nio.{ByteBuffer, ByteOrder} @@ -52,7 +52,6 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo } log.debug(s"write support initialized for requested schema $attributes") - ParquetRelation.enableLogForwarding() new WriteSupport.WriteContext(ParquetTypesConverter.convertFromAttributes(attributes), metadata) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala index 3854f5bd39fb..42376ef7a9c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.io.IOException @@ -104,7 +104,6 @@ private[parquet] object ParquetTypesConverter extends Logging { extraMetadata, "Spark") - ParquetRelation.enableLogForwarding() ParquetFileWriter.writeMetadataFile( conf, path, @@ -140,8 +139,6 @@ private[parquet] object ParquetTypesConverter extends Logging { (name(0) == '.' || name(0) == '_') && name != ParquetFileWriter.PARQUET_METADATA_FILE } - ParquetRelation.enableLogForwarding() - // NOTE (lian): Parquet "_metadata" file can be very slow if the file consists of lots of row // groups. Since Parquet schema is replicated among all row groups, we only need to touch a // single row group to read schema related metadata. 
Notice that we are making assumptions that diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 11bb49b8d83d..d9f013191947 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -101,7 +101,8 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => } } - case logical.InsertIntoTable(LogicalRelation(r: HadoopFsRelation), part, _, _, _) => + case logical.InsertIntoTable( + LogicalRelation(r: HadoopFsRelation), part, query, overwrite, _) => // We need to make sure the partition columns specified by users do match partition // columns of the relation. val existingPartitionColumns = r.partitionColumns.fieldNames.toSet @@ -115,6 +116,17 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => // OK } + // Get all input data source relations of the query. + val srcRelations = query.collect { + case LogicalRelation(src: BaseRelation) => src + } + if (srcRelations.contains(r)) { + failAnalysis( + "Cannot insert overwrite into table that is also being read from.") + } else { + // OK + } + case logical.InsertIntoTable(l: LogicalRelation, _, _, _, _) => // The relation in l is not an InsertableRelation. failAnalysis(s"$l does not allow insertion.") @@ -126,12 +138,12 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => // OK } - case CreateTableUsingAsSelect(tableName, _, _, _, SaveMode.Overwrite, _, query) => + case CreateTableUsingAsSelect(tableIdent, _, _, _, SaveMode.Overwrite, _, query) => // When the SaveMode is Overwrite, we need to check if the table is an input table of // the query. If so, we will throw an AnalysisException to let users know it is not allowed. - if (catalog.tableExists(Seq(tableName))) { + if (catalog.tableExists(tableIdent.toSeq)) { // Need to remove SubQuery operator. - EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) match { + EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) match { // Only do the check if the table is a data source table // (the relation is a BaseRelation). 
case l @ LogicalRelation(dest: BaseRelation) => @@ -141,7 +153,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => } if (srcRelations.contains(dest)) { failAnalysis( - s"Cannot overwrite table $tableName that is also being read from.") + s"Cannot overwrite table $tableIdent that is also being read from.") } else { // OK } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index c37007f1eece..74892e4e13fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -17,21 +17,16 @@ package org.apache.spark.sql.execution -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.unsafe.types.UTF8String - import scala.collection.mutable.HashSet -import org.apache.spark.{AccumulatorParam, Accumulator, Logging} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef -import org.apache.spark.sql.types._ +import org.apache.spark.{Accumulator, AccumulatorParam, Logging} /** - * :: DeveloperApi :: * Contains methods for debugging query execution. * * Usage: @@ -53,10 +48,8 @@ package object debug { } /** - * :: DeveloperApi :: * Augments [[DataFrame]]s with debug methods. */ - @DeveloperApi implicit class DebugQuery(query: DataFrame) extends Logging { def debug(): Unit = { val plan = query.queryExecution.executedPlan @@ -72,23 +65,6 @@ package object debug { case _ => } } - - def typeCheck(): Unit = { - val plan = query.queryExecution.executedPlan - val visited = new collection.mutable.HashSet[TreeNodeRef]() - val debugPlan = plan transform { - case s: SparkPlan if !visited.contains(new TreeNodeRef(s)) => - visited += new TreeNodeRef(s) - TypeCheck(s) - } - try { - logDebug(s"Results returned: ${debugPlan.execute().count()}") - } catch { - case e: Exception => - def unwrap(e: Throwable): Throwable = if (e.getCause == null) e else unwrap(e.getCause) - logDebug(s"Deepest Error: ${unwrap(e)}") - } - } } private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode { @@ -148,76 +124,4 @@ package object debug { } } } - - /** - * Helper functions for checking that runtime types match a given schema. 
- */ - private[sql] object TypeCheck { - def typeCheck(data: Any, schema: DataType): Unit = (data, schema) match { - case (null, _) => - - case (row: InternalRow, StructType(fields)) => - row.toSeq.zip(fields.map(_.dataType)).foreach { case(d, t) => typeCheck(d, t) } - case (a: ArrayData, ArrayType(elemType, _)) => - a.foreach(elemType, (_, e) => { - typeCheck(e, elemType) - }) - case (m: MapData, MapType(keyType, valueType, _)) => - m.keyArray().foreach(keyType, (_, e) => { - typeCheck(e, keyType) - }) - m.valueArray().foreach(valueType, (_, e) => { - typeCheck(e, valueType) - }) - - case (_: Long, LongType) => - case (_: Int, IntegerType) => - case (_: UTF8String, StringType) => - case (_: Float, FloatType) => - case (_: Byte, ByteType) => - case (_: Short, ShortType) => - case (_: Boolean, BooleanType) => - case (_: Double, DoubleType) => - case (_: Int, DateType) => - case (_: Long, TimestampType) => - case (v, udt: UserDefinedType[_]) => typeCheck(v, udt.sqlType) - - case (d, t) => sys.error(s"Invalid data found: got $d (${d.getClass}) expected $t") - } - } - - /** - * Augments [[DataFrame]]s with debug methods. - */ - private[sql] case class TypeCheck(child: SparkPlan) extends SparkPlan { - import TypeCheck._ - - override def nodeName: String = "" - - /* Only required when defining this class in a REPL. - override def makeCopy(args: Array[Object]): this.type = - TypeCheck(args(0).asInstanceOf[SparkPlan]).asInstanceOf[this.type] - */ - - def output: Seq[Attribute] = child.output - - def children: List[SparkPlan] = child :: Nil - - protected override def doExecute(): RDD[InternalRow] = { - child.execute().map { row => - try typeCheck(row, child.schema) catch { - case e: Exception => - sys.error( - s""" - |ERROR WHEN TYPE CHECKING QUERY - |============================== - |$e - |======== BAD TREE ============ - |$child - """.stripMargin) - } - row - } - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala index 624efc1b1d73..2e108cb81451 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala @@ -25,8 +25,10 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} -import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.{BinaryNode, SQLExecution, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.ThreadUtils +import org.apache.spark.{InternalAccumulator, TaskContext} /** * :: DeveloperApi :: @@ -44,6 +46,11 @@ case class BroadcastHashJoin( right: SparkPlan) extends BinaryNode with HashJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + val timeout: Duration = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { @@ -58,25 +65,65 @@ case class BroadcastHashJoin( override def requiredChildDistribution: Seq[Distribution] = UnspecifiedDistribution :: 
UnspecifiedDistribution :: Nil + // Use lazy so that we won't do broadcast when calling explain but still cache the broadcast value + // for the same query. @transient - private val broadcastFuture = future { - // Note that we use .execute().collect() because we don't want to convert data to Scala types - val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect() - val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.size) - sparkContext.broadcast(hashed) - }(BroadcastHashJoin.broadcastHashJoinExecutionContext) + private lazy val broadcastFuture = { + val numBuildRows = buildSide match { + case BuildLeft => longMetric("numLeftRows") + case BuildRight => longMetric("numRightRows") + } + + // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly here. + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + future { + // This will run in another thread. Set the execution id so that we can connect these jobs + // with the correct execution. + SQLExecution.withExecutionId(sparkContext, executionId) { + // Note that we use .execute().collect() because we don't want to convert data to Scala + // types + val input: Array[InternalRow] = buildPlan.execute().map { row => + numBuildRows += 1 + row.copy() + }.collect() + // The following line doesn't run in a job so we cannot track the metric value. However, we + // have already tracked it in the above lines. So here we can use + // `SQLMetrics.nullLongMetric` to ignore it. + val hashed = HashedRelation( + input.iterator, SQLMetrics.nullLongMetric, buildSideKeyGenerator, input.size) + sparkContext.broadcast(hashed) + } + }(BroadcastHashJoin.broadcastHashJoinExecutionContext) + } + + protected override def doPrepare(): Unit = { + broadcastFuture + } protected override def doExecute(): RDD[InternalRow] = { + val numStreamedRows = buildSide match { + case BuildLeft => longMetric("numRightRows") + case BuildRight => longMetric("numLeftRows") + } + val numOutputRows = longMetric("numOutputRows") + val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => - hashJoin(streamedIter, broadcastRelation.value) + val hashedRelation = broadcastRelation.value + hashedRelation match { + case unsafe: UnsafeHashedRelation => + TaskContext.get().internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) + case _ => + } + hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } object BroadcastHashJoin { - private val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( + private[joins] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala index 309716a0efcc..69a8b95eaa7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala @@ -24,10 +24,11 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, 
UnspecifiedDistribution} +import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter} -import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} -import org.apache.spark.util.ThreadUtils +import org.apache.spark.sql.execution.{BinaryNode, SQLExecution, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.{InternalAccumulator, TaskContext} /** * :: DeveloperApi :: @@ -45,6 +46,11 @@ case class BroadcastHashOuterJoin( left: SparkPlan, right: SparkPlan) extends BinaryNode with HashOuterJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { @@ -59,15 +65,54 @@ case class BroadcastHashOuterJoin( override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning + // Use lazy so that we won't do broadcast when calling explain but still cache the broadcast value + // for the same query. @transient - private val broadcastFuture = future { - // Note that we use .execute().collect() because we don't want to convert data to Scala types - val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect() - val hashed = HashedRelation(input.iterator, buildKeyGenerator, input.size) - sparkContext.broadcast(hashed) - }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext) + private lazy val broadcastFuture = { + val numBuildRows = joinType match { + case RightOuter => longMetric("numLeftRows") + case LeftOuter => longMetric("numRightRows") + case x => + throw new IllegalArgumentException( + s"HashOuterJoin should not take $x as the JoinType") + } + + // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly here. + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + future { + // This will run in another thread. Set the execution id so that we can connect these jobs + // with the correct execution. + SQLExecution.withExecutionId(sparkContext, executionId) { + // Note that we use .execute().collect() because we don't want to convert data to Scala + // types + val input: Array[InternalRow] = buildPlan.execute().map { row => + numBuildRows += 1 + row.copy() + }.collect() + // The following line doesn't run in a job so we cannot track the metric value. However, we + // have already tracked it in the above lines. So here we can use + // `SQLMetrics.nullLongMetric` to ignore it. 
+ val hashed = HashedRelation( + input.iterator, SQLMetrics.nullLongMetric, buildKeyGenerator, input.size) + sparkContext.broadcast(hashed) + } + }(BroadcastHashJoin.broadcastHashJoinExecutionContext) + } + + protected override def doPrepare(): Unit = { + broadcastFuture + } override def doExecute(): RDD[InternalRow] = { + val numStreamedRows = joinType match { + case RightOuter => longMetric("numRightRows") + case LeftOuter => longMetric("numLeftRows") + case x => + throw new IllegalArgumentException( + s"HashOuterJoin should not take $x as the JoinType") + } + val numOutputRows = longMetric("numOutputRows") + val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => @@ -75,19 +120,29 @@ case class BroadcastHashOuterJoin( val hashTable = broadcastRelation.value val keyGenerator = streamedKeyGenerator + hashTable match { + case unsafe: UnsafeHashedRelation => + TaskContext.get().internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) + case _ => + } + + val resultProj = resultProjection joinType match { case LeftOuter => streamedIter.flatMap(currentRow => { + numStreamedRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withLeft(currentRow) - leftOuterIterator(rowKey, joinedRow, hashTable.get(rowKey)) + leftOuterIterator(rowKey, joinedRow, hashTable.get(rowKey), resultProj, numOutputRows) }) case RightOuter => streamedIter.flatMap(currentRow => { + numStreamedRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withRight(currentRow) - rightOuterIterator(rowKey, hashTable.get(rowKey), joinedRow) + rightOuterIterator(rowKey, hashTable.get(rowKey), joinedRow, resultProj, numOutputRows) }) case x => @@ -97,9 +152,3 @@ case class BroadcastHashOuterJoin( } } } - -object BroadcastHashOuterJoin { - - private val broadcastHashOuterJoinExecutionContext = ExecutionContext.fromExecutorService( - ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-outer-join", 128)) -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala index a60593911f94..78a8c16c62bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala @@ -17,11 +17,13 @@ package org.apache.spark.sql.execution.joins +import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -36,22 +38,42 @@ case class BroadcastLeftSemiJoinHash( right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + protected override def doExecute(): RDD[InternalRow] = { - val input = right.execute().map(_.copy()).collect() + val numLeftRows = longMetric("numLeftRows") + val numRightRows = 
longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + + val input = right.execute().map { row => + numRightRows += 1 + row.copy() + }.collect() if (condition.isEmpty) { - val hashSet = buildKeyHashSet(input.toIterator) + val hashSet = buildKeyHashSet(input.toIterator, SQLMetrics.nullLongMetric) val broadcastedRelation = sparkContext.broadcast(hashSet) left.execute().mapPartitions { streamIter => - hashSemiJoin(streamIter, broadcastedRelation.value) + hashSemiJoin(streamIter, numLeftRows, broadcastedRelation.value, numOutputRows) } } else { - val hashRelation = HashedRelation(input.toIterator, rightKeyGenerator, input.size) + val hashRelation = + HashedRelation(input.toIterator, SQLMetrics.nullLongMetric, rightKeyGenerator, input.size) val broadcastedRelation = sparkContext.broadcast(hashRelation) left.execute().mapPartitions { streamIter => - hashSemiJoin(streamIter, broadcastedRelation.value) + val hashedRelation = broadcastedRelation.value + hashedRelation match { + case unsafe: UnsafeHashedRelation => + TaskContext.get().internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) + case _ => + } + hashSemiJoin(streamIter, numLeftRows, hashedRelation, numOutputRows) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala index 83b726a8e289..28c88b1b03d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.collection.CompactBuffer /** @@ -38,6 +39,11 @@ case class BroadcastNestedLoopJoin( condition: Option[Expression]) extends BinaryNode { // TODO: Override requiredChildDistribution. + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + /** BuildRight means the right relation <=> the broadcast relation. 
*/ private val (streamed, broadcast) = buildSide match { case BuildRight => (left, right) @@ -47,7 +53,7 @@ case class BroadcastNestedLoopJoin( override def outputsUnsafeRows: Boolean = left.outputsUnsafeRows || right.outputsUnsafeRows override def canProcessUnsafeRows: Boolean = true - @transient private[this] lazy val resultProjection: InternalRow => InternalRow = { + private[this] def genResultProjection: InternalRow => InternalRow = { if (outputsUnsafeRows) { UnsafeProjection.create(schema) } else { @@ -65,8 +71,9 @@ case class BroadcastNestedLoopJoin( left.output.map(_.withNullability(true)) ++ right.output case FullOuter => left.output.map(_.withNullability(true)) ++ right.output.map(_.withNullability(true)) - case _ => - left.output ++ right.output + case x => + throw new IllegalArgumentException( + s"BroadcastNestedLoopJoin should not take $x as the JoinType") } } @@ -74,9 +81,17 @@ case class BroadcastNestedLoopJoin( newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { + val (numStreamedRows, numBuildRows) = buildSide match { + case BuildRight => (longMetric("numLeftRows"), longMetric("numRightRows")) + case BuildLeft => (longMetric("numRightRows"), longMetric("numLeftRows")) + } + val numOutputRows = longMetric("numOutputRows") + val broadcastedRelation = - sparkContext.broadcast(broadcast.execute().map(_.copy()) - .collect().toIndexedSeq) + sparkContext.broadcast(broadcast.execute().map { row => + numBuildRows += 1 + row.copy() + }.collect().toIndexedSeq) /** All rows that either match both-way, or rows from streamed joined with nulls. */ val matchesOrStreamedRowsWithNulls = streamed.execute().mapPartitions { streamedIter => @@ -88,20 +103,22 @@ case class BroadcastNestedLoopJoin( val leftNulls = new GenericMutableRow(left.output.size) val rightNulls = new GenericMutableRow(right.output.size) + val resultProj = genResultProjection streamedIter.foreach { streamedRow => var i = 0 var streamRowMatched = false + numStreamedRows += 1 while (i < broadcastedRelation.value.size) { val broadcastedRow = broadcastedRelation.value(i) buildSide match { case BuildRight if boundCondition(joinedRow(streamedRow, broadcastedRow)) => - matchedRows += resultProjection(joinedRow(streamedRow, broadcastedRow)).copy() + matchedRows += resultProj(joinedRow(streamedRow, broadcastedRow)).copy() streamRowMatched = true includedBroadcastTuples += i case BuildLeft if boundCondition(joinedRow(broadcastedRow, streamedRow)) => - matchedRows += resultProjection(joinedRow(broadcastedRow, streamedRow)).copy() + matchedRows += resultProj(joinedRow(broadcastedRow, streamedRow)).copy() streamRowMatched = true includedBroadcastTuples += i case _ => @@ -111,9 +128,9 @@ case class BroadcastNestedLoopJoin( (streamRowMatched, joinType, buildSide) match { case (false, LeftOuter | FullOuter, BuildRight) => - matchedRows += resultProjection(joinedRow(streamedRow, rightNulls)).copy() + matchedRows += resultProj(joinedRow(streamedRow, rightNulls)).copy() case (false, RightOuter | FullOuter, BuildLeft) => - matchedRows += resultProjection(joinedRow(leftNulls, streamedRow)).copy() + matchedRows += resultProj(joinedRow(leftNulls, streamedRow)).copy() case _ => } } @@ -127,6 +144,8 @@ case class BroadcastNestedLoopJoin( val leftNulls = new GenericMutableRow(left.output.size) val rightNulls = new GenericMutableRow(right.output.size) + val resultProj = genResultProjection + /** Rows from broadcasted joined with nulls. 
*/ val broadcastRowsWithNulls: Seq[InternalRow] = { val buf: CompactBuffer[InternalRow] = new CompactBuffer() @@ -138,7 +157,7 @@ case class BroadcastNestedLoopJoin( joinedRow.withLeft(leftNulls) while (i < rel.length) { if (!allIncludedBroadcastTuples.contains(i)) { - buf += resultProjection(joinedRow.withRight(rel(i))).copy() + buf += resultProj(joinedRow.withRight(rel(i))).copy() } i += 1 } @@ -147,7 +166,7 @@ case class BroadcastNestedLoopJoin( joinedRow.withRight(rightNulls) while (i < rel.length) { if (!allIncludedBroadcastTuples.contains(i)) { - buf += resultProjection(joinedRow.withLeft(rel(i))).copy() + buf += resultProj(joinedRow.withLeft(rel(i))).copy() } i += 1 } @@ -158,6 +177,12 @@ case class BroadcastNestedLoopJoin( // TODO: Breaks lineage. sparkContext.union( - matchesOrStreamedRowsWithNulls.flatMap(_._1), sparkContext.makeRDD(broadcastRowsWithNulls)) + matchesOrStreamedRowsWithNulls.flatMap(_._1), + sparkContext.makeRDD(broadcastRowsWithNulls) + ).map { row => + // `broadcastRowsWithNulls` doesn't run in a job so that we have to track numOutputRows here. + numOutputRows += 1 + row + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala index 261b4724159f..2115f4070228 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala @@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -30,13 +31,31 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + protected override def doExecute(): RDD[InternalRow] = { - val leftResults = left.execute().map(_.copy()) - val rightResults = right.execute().map(_.copy()) + val numLeftRows = longMetric("numLeftRows") + val numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + + val leftResults = left.execute().map { row => + numLeftRows += 1 + row.copy() + } + val rightResults = right.execute().map { row => + numRightRows += 1 + row.copy() + } leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow - iter.map(r => joinedRow(r._1, r._2)) + iter.map { r => + numOutputRows += 1 + joinedRow(r._1, r._2) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index 6b3d1652923f..7ce4a517838c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import 
org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashJoin { @@ -44,7 +45,8 @@ trait HashJoin { override def output: Seq[Attribute] = left.output ++ right.output protected[this] def isUnsafeMode: Boolean = { - (self.codegenEnabled && UnsafeProjection.canSupport(buildKeys) + (self.codegenEnabled && self.unsafeEnabled + && UnsafeProjection.canSupport(buildKeys) && UnsafeProjection.canSupport(self.schema)) } @@ -52,14 +54,14 @@ trait HashJoin { override def canProcessUnsafeRows: Boolean = isUnsafeMode override def canProcessSafeRows: Boolean = !isUnsafeMode - @transient protected lazy val buildSideKeyGenerator: Projection = + protected def buildSideKeyGenerator: Projection = if (isUnsafeMode) { UnsafeProjection.create(buildKeys, buildPlan.output) } else { newMutableProjection(buildKeys, buildPlan.output)() } - @transient protected lazy val streamSideKeyGenerator: Projection = + protected def streamSideKeyGenerator: Projection = if (isUnsafeMode) { UnsafeProjection.create(streamedKeys, streamedPlan.output) } else { @@ -68,7 +70,9 @@ trait HashJoin { protected def hashJoin( streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = + numStreamRows: LongSQLMetric, + hashedRelation: HashedRelation, + numOutputRows: LongSQLMetric): Iterator[InternalRow] = { new Iterator[InternalRow] { private[this] var currentStreamedRow: InternalRow = _ @@ -97,6 +101,7 @@ trait HashJoin { case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } currentMatchPosition += 1 + numOutputRows += 1 resultProjection(ret) } @@ -112,6 +117,7 @@ trait HashJoin { while (currentHashMatches == null && streamIter.hasNext) { currentStreamedRow = streamIter.next() + numStreamRows += 1 val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashedRelation.get(key) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala index a323aea4ea2c..66903347c88c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.LongSQLMetric import org.apache.spark.util.collection.CompactBuffer @DeveloperApi @@ -67,7 +68,7 @@ trait HashOuterJoin { } protected[this] def isUnsafeMode: Boolean = { - (self.codegenEnabled && joinType != FullOuter + (self.codegenEnabled && self.unsafeEnabled && joinType != FullOuter && UnsafeProjection.canSupport(buildKeys) && UnsafeProjection.canSupport(self.schema)) } @@ -76,14 +77,14 @@ trait HashOuterJoin { override def canProcessUnsafeRows: Boolean = isUnsafeMode override def canProcessSafeRows: Boolean = !isUnsafeMode - @transient protected lazy val buildKeyGenerator: Projection = + protected def buildKeyGenerator: Projection = if (isUnsafeMode) { UnsafeProjection.create(buildKeys, buildPlan.output) } else { newMutableProjection(buildKeys, buildPlan.output)() } - @transient protected[this] lazy val streamedKeyGenerator: Projection = { + protected[this] def streamedKeyGenerator: Projection = { if (isUnsafeMode) { 
UnsafeProjection.create(streamedKeys, streamedPlan.output) } else { @@ -91,7 +92,7 @@ trait HashOuterJoin { } } - @transient private[this] lazy val resultProjection: InternalRow => InternalRow = { + protected[this] def resultProjection: InternalRow => InternalRow = { if (isUnsafeMode) { UnsafeProjection.create(self.schema) } else { @@ -113,23 +114,30 @@ trait HashOuterJoin { protected[this] def leftOuterIterator( key: InternalRow, joinedRow: JoinedRow, - rightIter: Iterable[InternalRow]): Iterator[InternalRow] = { + rightIter: Iterable[InternalRow], + resultProjection: InternalRow => InternalRow, + numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val ret: Iterable[InternalRow] = { if (!key.anyNull) { val temp = if (rightIter != null) { rightIter.collect { - case r if boundCondition(joinedRow.withRight(r)) => resultProjection(joinedRow).copy() + case r if boundCondition(joinedRow.withRight(r)) => { + numOutputRows += 1 + resultProjection(joinedRow).copy() + } } } else { List.empty } if (temp.isEmpty) { - resultProjection(joinedRow.withRight(rightNullRow)).copy :: Nil + numOutputRows += 1 + resultProjection(joinedRow.withRight(rightNullRow)) :: Nil } else { temp } } else { - resultProjection(joinedRow.withRight(rightNullRow)).copy :: Nil + numOutputRows += 1 + resultProjection(joinedRow.withRight(rightNullRow)) :: Nil } } ret.iterator @@ -138,24 +146,30 @@ trait HashOuterJoin { protected[this] def rightOuterIterator( key: InternalRow, leftIter: Iterable[InternalRow], - joinedRow: JoinedRow): Iterator[InternalRow] = { + joinedRow: JoinedRow, + resultProjection: InternalRow => InternalRow, + numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val ret: Iterable[InternalRow] = { if (!key.anyNull) { val temp = if (leftIter != null) { leftIter.collect { - case l if boundCondition(joinedRow.withLeft(l)) => + case l if boundCondition(joinedRow.withLeft(l)) => { + numOutputRows += 1 resultProjection(joinedRow).copy() + } } } else { List.empty } if (temp.isEmpty) { - resultProjection(joinedRow.withLeft(leftNullRow)).copy :: Nil + numOutputRows += 1 + resultProjection(joinedRow.withLeft(leftNullRow)) :: Nil } else { temp } } else { - resultProjection(joinedRow.withLeft(leftNullRow)).copy :: Nil + numOutputRows += 1 + resultProjection(joinedRow.withLeft(leftNullRow)) :: Nil } } ret.iterator @@ -163,7 +177,7 @@ trait HashOuterJoin { protected[this] def fullOuterIterator( key: InternalRow, leftIter: Iterable[InternalRow], rightIter: Iterable[InternalRow], - joinedRow: JoinedRow): Iterator[InternalRow] = { + joinedRow: JoinedRow, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { if (!key.anyNull) { // Store the positions of records in right, if one of its associated row satisfy // the join condition. @@ -176,6 +190,7 @@ trait HashOuterJoin { // append them directly case (r, idx) if boundCondition(joinedRow.withRight(r)) => + numOutputRows += 1 matched = true // if the row satisfy the join condition, add its index into the matched set rightMatchedSet.add(idx) @@ -188,6 +203,7 @@ trait HashOuterJoin { // as we don't know whether we need to append it until finish iterating all // of the records in right side. // If we didn't get any proper row, then append a single row with empty right. + numOutputRows += 1 joinedRow.withRight(rightNullRow).copy() }) } ++ rightIter.zipWithIndex.collect { @@ -196,12 +212,15 @@ trait HashOuterJoin { // Re-visiting the records in right, and append additional row with empty left, if its not // in the matched set. 
case (r, idx) if !rightMatchedSet.contains(idx) => + numOutputRows += 1 joinedRow(leftNullRow, r).copy() } } else { leftIter.iterator.map[InternalRow] { l => + numOutputRows += 1 joinedRow(l, rightNullRow).copy() } ++ rightIter.iterator.map[InternalRow] { r => + numOutputRows += 1 joinedRow(leftNullRow, r).copy() } } @@ -210,10 +229,12 @@ trait HashOuterJoin { // This is only used by FullOuter protected[this] def buildHashTable( iter: Iterator[InternalRow], + numIterRows: LongSQLMetric, keyGenerator: Projection): JavaHashMap[InternalRow, CompactBuffer[InternalRow]] = { val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]]() while (iter.hasNext) { val currentRow = iter.next() + numIterRows += 1 val rowKey = keyGenerator(currentRow) var existingMatchList = hashTable.get(rowKey) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala index 97fde8f975bf..beb141ade616 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashSemiJoin { @@ -33,7 +34,8 @@ trait HashSemiJoin { override def output: Seq[Attribute] = left.output protected[this] def supportUnsafe: Boolean = { - (self.codegenEnabled && UnsafeProjection.canSupport(leftKeys) + (self.codegenEnabled && self.unsafeEnabled + && UnsafeProjection.canSupport(leftKeys) && UnsafeProjection.canSupport(rightKeys) && UnsafeProjection.canSupport(left.schema) && UnsafeProjection.canSupport(right.schema)) @@ -43,14 +45,14 @@ trait HashSemiJoin { override def canProcessUnsafeRows: Boolean = supportUnsafe override def canProcessSafeRows: Boolean = !supportUnsafe - @transient protected lazy val leftKeyGenerator: Projection = + protected def leftKeyGenerator: Projection = if (supportUnsafe) { UnsafeProjection.create(leftKeys, left.output) } else { newMutableProjection(leftKeys, left.output)() } - @transient protected lazy val rightKeyGenerator: Projection = + protected def rightKeyGenerator: Projection = if (supportUnsafe) { UnsafeProjection.create(rightKeys, right.output) } else { @@ -60,14 +62,15 @@ trait HashSemiJoin { @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) - protected def buildKeyHashSet(buildIter: Iterator[InternalRow]): java.util.Set[InternalRow] = { + protected def buildKeyHashSet( + buildIter: Iterator[InternalRow], numBuildRows: LongSQLMetric): java.util.Set[InternalRow] = { val hashSet = new java.util.HashSet[InternalRow]() - var currentRow: InternalRow = null // Create a Hash set of buildKeys val rightKey = rightKeyGenerator while (buildIter.hasNext) { - currentRow = buildIter.next() + val currentRow = buildIter.next() + numBuildRows += 1 val rowKey = rightKey(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) @@ -76,30 +79,41 @@ trait HashSemiJoin { } } } + hashSet } protected def hashSemiJoin( streamIter: Iterator[InternalRow], - hashSet: java.util.Set[InternalRow]): Iterator[InternalRow] = { + numStreamRows: LongSQLMetric, + hashSet: java.util.Set[InternalRow], + numOutputRows: LongSQLMetric): 
Iterator[InternalRow] = { val joinKeys = leftKeyGenerator streamIter.filter(current => { + numStreamRows += 1 val key = joinKeys(current) - !key.anyNull && hashSet.contains(key) + val r = !key.anyNull && hashSet.contains(key) + if (r) numOutputRows += 1 + r }) } protected def hashSemiJoin( streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { + numStreamRows: LongSQLMetric, + hashedRelation: HashedRelation, + numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator val joinedRow = new JoinedRow streamIter.filter { current => + numStreamRows += 1 val key = joinKeys(current) lazy val rowBuffer = hashedRelation.get(key) - !key.anyNull && rowBuffer != null && rowBuffer.exists { + val r = !key.anyNull && rowBuffer != null && rowBuffer.exists { (row: InternalRow) => boundCondition(joinedRow(current, row)) } + if (r) numOutputRows += 1 + r } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index cc8bbfd2f894..6c0196c21a0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -17,20 +17,21 @@ package org.apache.spark.sql.execution.joins -import java.io.{IOException, Externalizable, ObjectInput, ObjectOutput} +import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} import java.nio.ByteOrder import java.util.{HashMap => JavaHashMap} import org.apache.spark.shuffle.ShuffleMemoryManager -import org.apache.spark.{SparkConf, SparkEnv, TaskContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkSqlSerializer -import org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.sql.execution.metric.LongSQLMetric +import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.map.BytesToBytesMap -import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} +import org.apache.spark.unsafe.memory.{MemoryLocation, ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} import org.apache.spark.util.Utils import org.apache.spark.util.collection.CompactBuffer +import org.apache.spark.{SparkConf, SparkEnv} /** @@ -65,7 +66,8 @@ private[joins] final class GeneralHashedRelation( private var hashTable: JavaHashMap[InternalRow, CompactBuffer[InternalRow]]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = hashTable.get(key) @@ -87,7 +89,8 @@ private[joins] final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalRow, InternalRow]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = { val v = hashTable.get(key) @@ -112,11 +115,13 @@ private[joins] object HashedRelation { def apply( input: Iterator[InternalRow], + numInputRows: LongSQLMetric, keyGenerator: Projection, sizeEstimate: Int = 64): HashedRelation = { if (keyGenerator.isInstanceOf[UnsafeProjection]) { - return 
UnsafeHashedRelation(input, keyGenerator.asInstanceOf[UnsafeProjection], sizeEstimate) + return UnsafeHashedRelation( + input, numInputRows, keyGenerator.asInstanceOf[UnsafeProjection], sizeEstimate) } // TODO: Use Spark's HashMap implementation. @@ -130,6 +135,7 @@ private[joins] object HashedRelation { // Create a mapping of buildKeys -> rows while (input.hasNext) { currentRow = input.next() + numInputRows += 1 val rowKey = keyGenerator(currentRow) if (!rowKey.anyNull) { val existingMatchList = hashTable.get(rowKey) @@ -183,15 +189,36 @@ private[joins] final class UnsafeHashedRelation( private[joins] def this() = this(null) // Needed for serialization // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + // This is used in broadcast joins and distributed mode only @transient private[this] var binaryMap: BytesToBytesMap = _ + /** + * Return the size of the unsafe map on the executors. + * + * For broadcast joins, this hashed relation is bigger on the driver because it is + * represented as a Java hash map there. While serializing the map to the executors, + * however, we rehash the contents in a binary map to reduce the memory footprint on + * the executors. + * + * For non-broadcast joins or in local mode, return 0. + */ + def getUnsafeSize: Long = { + if (binaryMap != null) { + binaryMap.getTotalMemoryConsumption + } else { + 0 + } + } + override def get(key: InternalRow): Seq[InternalRow] = { val unsafeKey = key.asInstanceOf[UnsafeRow] if (binaryMap != null) { // Used in Broadcast join - val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, - unsafeKey.getSizeInBytes) + val map = binaryMap // avoid the compiler error + val loc = new map.Location // this could be allocated in stack + binaryMap.safeLookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, + unsafeKey.getSizeInBytes, loc) if (loc.isDefined) { val buffer = CompactBuffer[UnsafeRow]() @@ -199,8 +226,8 @@ private[joins] final class UnsafeHashedRelation( var offset = loc.getValueAddress.getBaseOffset val last = loc.getValueAddress.getBaseOffset + loc.getValueLength while (offset < last) { - val numFields = PlatformDependent.UNSAFE.getInt(base, offset) - val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4) + val numFields = Platform.getInt(base, offset) + val sizeInBytes = Platform.getInt(base, offset + 4) offset += 8 val row = new UnsafeRow @@ -214,46 +241,73 @@ private[joins] final class UnsafeHashedRelation( } } else { - // Use the JavaHashMap in Local mode or ShuffleHashJoin + // Use the Java HashMap in local mode or for non-broadcast joins (e.g. 
ShuffleHashJoin) hashTable.get(unsafeKey) } } override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { - out.writeInt(hashTable.size()) - - val iter = hashTable.entrySet().iterator() - while (iter.hasNext) { - val entry = iter.next() - val key = entry.getKey - val values = entry.getValue - - // write all the values as single byte array - var totalSize = 0L - var i = 0 - while (i < values.length) { - totalSize += values(i).getSizeInBytes + 4 + 4 - i += 1 + if (binaryMap != null) { + // This could happen when a cached broadcast object need to be dumped into disk to free memory + out.writeInt(binaryMap.numElements()) + + var buffer = new Array[Byte](64) + def write(addr: MemoryLocation, length: Int): Unit = { + if (buffer.length < length) { + buffer = new Array[Byte](length) + } + Platform.copyMemory(addr.getBaseObject, addr.getBaseOffset, + buffer, Platform.BYTE_ARRAY_OFFSET, length) + out.write(buffer, 0, length) } - assert(totalSize < Integer.MAX_VALUE, "values are too big") - - // [key size] [values size] [key bytes] [values bytes] - out.writeInt(key.getSizeInBytes) - out.writeInt(totalSize.toInt) - out.write(key.getBytes) - i = 0 - while (i < values.length) { - // [num of fields] [num of bytes] [row bytes] - // write the integer in native order, so they can be read by UNSAFE.getInt() - if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { - out.writeInt(values(i).numFields()) - out.writeInt(values(i).getSizeInBytes) - } else { - out.writeInt(Integer.reverseBytes(values(i).numFields())) - out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) + + val iter = binaryMap.iterator() + while (iter.hasNext) { + val loc = iter.next() + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(loc.getKeyLength) + out.writeInt(loc.getValueLength) + write(loc.getKeyAddress, loc.getKeyLength) + write(loc.getValueAddress, loc.getValueLength) + } + + } else { + assert(hashTable != null) + out.writeInt(hashTable.size()) + + val iter = hashTable.entrySet().iterator() + while (iter.hasNext) { + val entry = iter.next() + val key = entry.getKey + val values = entry.getValue + + // write all the values as single byte array + var totalSize = 0L + var i = 0 + while (i < values.length) { + totalSize += values(i).getSizeInBytes + 4 + 4 + i += 1 + } + assert(totalSize < Integer.MAX_VALUE, "values are too big") + + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(key.getSizeInBytes) + out.writeInt(totalSize.toInt) + out.write(key.getBytes) + i = 0 + while (i < values.length) { + // [num of fields] [num of bytes] [row bytes] + // write the integer in native order, so they can be read by UNSAFE.getInt() + if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + out.writeInt(values(i).numFields()) + out.writeInt(values(i).getSizeInBytes) + } else { + out.writeInt(Integer.reverseBytes(values(i).numFields())) + out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) + } + out.write(values(i).getBytes) + i += 1 } - out.write(values(i).getBytes) - i += 1 } } } @@ -263,22 +317,20 @@ private[joins] final class UnsafeHashedRelation( // This is used in Broadcast, shared by multiple tasks, so we use on-heap memory val taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP)) + val pageSizeBytes = Option(SparkEnv.get).map(_.shuffleMemoryManager.pageSizeBytes) + .getOrElse(new SparkConf().getSizeAsBytes("spark.buffer.pageSize", "16m")) + // Dummy shuffle memory manager which always grants all memory allocation requests. 
// We use this because it doesn't make sense count shared broadcast variables' memory usage // towards individual tasks' quotas. In the future, we should devise a better way of handling // this. - val shuffleMemoryManager = new ShuffleMemoryManager(new SparkConf()) { - override def tryToAcquire(numBytes: Long): Long = numBytes - override def release(numBytes: Long): Unit = {} - } - - val pageSizeBytes = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) - .getSizeAsBytes("spark.buffer.pageSize", "64m") + val shuffleMemoryManager = + ShuffleMemoryManager.create(maxMemory = Long.MaxValue, pageSizeBytes = pageSizeBytes) binaryMap = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, - nKeys * 2, // reduce hash collision + (nKeys * 1.5 + 1).toInt, // reduce hash collision pageSizeBytes) var i = 0 @@ -287,20 +339,21 @@ private[joins] final class UnsafeHashedRelation( while (i < nKeys) { val keySize = in.readInt() val valuesSize = in.readInt() - if (keySize > keyBuffer.size) { + if (keySize > keyBuffer.length) { keyBuffer = new Array[Byte](keySize) } in.readFully(keyBuffer, 0, keySize) - if (valuesSize > valuesBuffer.size) { + if (valuesSize > valuesBuffer.length) { valuesBuffer = new Array[Byte](valuesSize) } in.readFully(valuesBuffer, 0, valuesSize) // put it into binary map - val loc = binaryMap.lookup(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize) + val loc = binaryMap.lookup(keyBuffer, Platform.BYTE_ARRAY_OFFSET, keySize) assert(!loc.isDefined, "Duplicated key found!") - val putSuceeded = loc.putNewKey(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize, - valuesBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, valuesSize) + val putSuceeded = loc.putNewKey( + keyBuffer, Platform.BYTE_ARRAY_OFFSET, keySize, + valuesBuffer, Platform.BYTE_ARRAY_OFFSET, valuesSize) if (!putSuceeded) { throw new IOException("Could not allocate memory to grow BytesToBytesMap") } @@ -313,14 +366,17 @@ private[joins] object UnsafeHashedRelation { def apply( input: Iterator[InternalRow], + numInputRows: LongSQLMetric, keyGenerator: UnsafeProjection, sizeEstimate: Int): HashedRelation = { + // Use a Java hash table here because unsafe maps expect fixed size records val hashTable = new JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]](sizeEstimate) // Create a mapping of buildKeys -> rows while (input.hasNext) { val unsafeRow = input.next().asInstanceOf[UnsafeRow] + numInputRows += 1 val rowKey = keyGenerator(unsafeRow) if (!rowKey.anyNull) { val existingMatchList = hashTable.get(rowKey) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala index 4443455ef11f..ad6362542f2f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -35,6 +36,11 @@ case class LeftSemiJoinBNL( extends BinaryNode { // TODO: Override requiredChildDistribution. 
+ override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def outputPartitioning: Partitioning = streamed.outputPartitioning override def output: Seq[Attribute] = left.output @@ -52,13 +58,21 @@ case class LeftSemiJoinBNL( newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { + val numLeftRows = longMetric("numLeftRows") + val numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + val broadcastedRelation = - sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq) + sparkContext.broadcast(broadcast.execute().map { row => + numRightRows += 1 + row.copy() + }.collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { + numLeftRows += 1 var i = 0 var matched = false @@ -69,6 +83,9 @@ case class LeftSemiJoinBNL( } i += 1 } + if (matched) { + numOutputRows += 1 + } matched }) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala index 68ccd34d8ed9..18808adaac63 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, ClusteredDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -37,19 +38,28 @@ case class LeftSemiJoinHash( right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { + val numLeftRows = longMetric("numLeftRows") + val numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) => if (condition.isEmpty) { - val hashSet = buildKeyHashSet(buildIter) - hashSemiJoin(streamIter, hashSet) + val hashSet = buildKeyHashSet(buildIter, numRightRows) + hashSemiJoin(streamIter, numLeftRows, hashSet, numOutputRows) } else { - val hashRelation = HashedRelation(buildIter, rightKeyGenerator) - hashSemiJoin(streamIter, hashRelation) + val hashRelation = HashedRelation(buildIter, numRightRows, rightKeyGenerator) + hashSemiJoin(streamIter, numLeftRows, hashRelation, numOutputRows) } } } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala index fc6efe87bceb..fc8c9439a6f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -38,6 +39,11 @@ case class ShuffledHashJoin( right: SparkPlan) extends BinaryNode with HashJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) @@ -45,9 +51,15 @@ case class ShuffledHashJoin( ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { + val (numBuildRows, numStreamedRows) = buildSide match { + case BuildLeft => (longMetric("numLeftRows"), longMetric("numRightRows")) + case BuildRight => (longMetric("numRightRows"), longMetric("numLeftRows")) + } + val numOutputRows = longMetric("numOutputRows") + buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => - val hashed = HashedRelation(buildIter, buildSideKeyGenerator) - hashJoin(streamIter, hashed) + val hashed = HashedRelation(buildIter, numBuildRows, buildSideKeyGenerator) + hashJoin(streamIter, numStreamedRows, hashed, numOutputRows) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala index eee8ad800f98..ed282f98b7d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} +import org.apache.spark.sql.execution.metric.SQLMetrics /** * :: DeveloperApi :: @@ -41,6 +42,11 @@ case class ShuffledHashOuterJoin( left: SparkPlan, right: SparkPlan) extends BinaryNode with HashOuterJoin { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil @@ -53,37 +59,48 @@ case class ShuffledHashOuterJoin( } protected override def doExecute(): RDD[InternalRow] = { + val numLeftRows = longMetric("numLeftRows") + val 
numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + val joinedRow = new JoinedRow() left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => // TODO this probably can be replaced by external sort (sort merged join?) joinType match { case LeftOuter => - val hashed = HashedRelation(rightIter, buildKeyGenerator) + val hashed = HashedRelation(rightIter, numRightRows, buildKeyGenerator) val keyGenerator = streamedKeyGenerator + val resultProj = resultProjection leftIter.flatMap( currentRow => { + numLeftRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withLeft(currentRow) - leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey)) + leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey), resultProj, numOutputRows) }) case RightOuter => - val hashed = HashedRelation(leftIter, buildKeyGenerator) + val hashed = HashedRelation(leftIter, numLeftRows, buildKeyGenerator) val keyGenerator = streamedKeyGenerator + val resultProj = resultProjection rightIter.flatMap ( currentRow => { + numRightRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withRight(currentRow) - rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow) + rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow, resultProj, numOutputRows) }) case FullOuter => // TODO(davies): use UnsafeRow - val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output)) - val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output)) + val leftHashTable = + buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output)) + val rightHashTable = + buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output)) (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key => fullOuterIterator(key, leftHashTable.getOrElse(key, EMPTY_LIST), rightHashTable.getOrElse(key, EMPTY_LIST), - joinedRow) + joinedRow, + numOutputRows) } case x => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala index 41be78afd37e..6b7322671d6b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala @@ -17,15 +17,15 @@ package org.apache.spark.sql.execution.joins -import java.util.NoSuchElementException +import scala.collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} -import org.apache.spark.util.collection.CompactBuffer +import org.apache.spark.sql.execution.{BinaryNode, RowIterator, SparkPlan} +import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetrics} /** * :: DeveloperApi :: @@ -38,6 +38,11 @@ case class SortMergeJoin( left: SparkPlan, right: SparkPlan) extends BinaryNode { + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = @@ 
-46,9 +51,6 @@ case class SortMergeJoin( override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil - // this is to manually construct an ordering that can be used to compare keys from both sides - private val keyOrdering: RowOrdering = RowOrdering.forSchema(leftKeys.map(_.dataType)) - override def outputOrdering: Seq[SortOrder] = requiredOrders(leftKeys) override def requiredChildOrdering: Seq[Seq[SortOrder]] = @@ -57,113 +59,276 @@ case class SortMergeJoin( @transient protected lazy val leftKeyGenerator = newProjection(leftKeys, left.output) @transient protected lazy val rightKeyGenerator = newProjection(rightKeys, right.output) - private def requiredOrders(keys: Seq[Expression]): Seq[SortOrder] = + protected[this] def isUnsafeMode: Boolean = { + (codegenEnabled && unsafeEnabled + && UnsafeProjection.canSupport(leftKeys) + && UnsafeProjection.canSupport(rightKeys) + && UnsafeProjection.canSupport(schema)) + } + + override def outputsUnsafeRows: Boolean = isUnsafeMode + override def canProcessUnsafeRows: Boolean = isUnsafeMode + override def canProcessSafeRows: Boolean = !isUnsafeMode + + private def requiredOrders(keys: Seq[Expression]): Seq[SortOrder] = { + // This must be ascending in order to agree with the `keyOrdering` defined in `doExecute()`. keys.map(SortOrder(_, Ascending)) + } protected override def doExecute(): RDD[InternalRow] = { - val leftResults = left.execute().map(_.copy()) - val rightResults = right.execute().map(_.copy()) + val numLeftRows = longMetric("numLeftRows") + val numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") - leftResults.zipPartitions(rightResults) { (leftIter, rightIter) => - new Iterator[InternalRow] { - // Mutable per row objects. + left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => + new RowIterator { + // An ordering that can be used to compare keys from both sides. 
+ private[this] val keyOrdering = newNaturalAscendingOrdering(leftKeys.map(_.dataType)) + private[this] var currentLeftRow: InternalRow = _ + private[this] var currentRightMatches: ArrayBuffer[InternalRow] = _ + private[this] var currentMatchIdx: Int = -1 + private[this] val smjScanner = new SortMergeJoinScanner( + leftKeyGenerator, + rightKeyGenerator, + keyOrdering, + RowIterator.fromScala(leftIter), + numLeftRows, + RowIterator.fromScala(rightIter), + numRightRows + ) private[this] val joinRow = new JoinedRow - private[this] var leftElement: InternalRow = _ - private[this] var rightElement: InternalRow = _ - private[this] var leftKey: InternalRow = _ - private[this] var rightKey: InternalRow = _ - private[this] var rightMatches: CompactBuffer[InternalRow] = _ - private[this] var rightPosition: Int = -1 - private[this] var stop: Boolean = false - private[this] var matchKey: InternalRow = _ - - // initialize iterator - initialize() - - override final def hasNext: Boolean = nextMatchingPair() - - override final def next(): InternalRow = { - if (hasNext) { - // we are using the buffered right rows and run down left iterator - val joinedRow = joinRow(leftElement, rightMatches(rightPosition)) - rightPosition += 1 - if (rightPosition >= rightMatches.size) { - rightPosition = 0 - fetchLeft() - if (leftElement == null || keyOrdering.compare(leftKey, matchKey) != 0) { - stop = false - rightMatches = null - } - } - joinedRow + private[this] val resultProjection: (InternalRow) => InternalRow = { + if (isUnsafeMode) { + UnsafeProjection.create(schema) } else { - // no more result - throw new NoSuchElementException + identity[InternalRow] } } - private def fetchLeft() = { - if (leftIter.hasNext) { - leftElement = leftIter.next() - leftKey = leftKeyGenerator(leftElement) - } else { - leftElement = null + override def advanceNext(): Boolean = { + if (currentMatchIdx == -1 || currentMatchIdx == currentRightMatches.length) { + if (smjScanner.findNextInnerJoinRows()) { + currentRightMatches = smjScanner.getBufferedMatches + currentLeftRow = smjScanner.getStreamedRow + currentMatchIdx = 0 + } else { + currentRightMatches = null + currentLeftRow = null + currentMatchIdx = -1 + } } - } - - private def fetchRight() = { - if (rightIter.hasNext) { - rightElement = rightIter.next() - rightKey = rightKeyGenerator(rightElement) + if (currentLeftRow != null) { + joinRow(currentLeftRow, currentRightMatches(currentMatchIdx)) + currentMatchIdx += 1 + numOutputRows += 1 + true } else { - rightElement = null + false } } - private def initialize() = { - fetchLeft() - fetchRight() + override def getRow: InternalRow = resultProjection(joinRow) + }.toScala + } + } +} + +/** + * Helper class that is used to implement [[SortMergeJoin]] and [[SortMergeOuterJoin]]. + * + * To perform an inner (outer) join, users of this class call [[findNextInnerJoinRows()]] + * ([[findNextOuterJoinRows()]]), which returns `true` if a result has been produced and `false` + * otherwise. If a result has been produced, then the caller may call [[getStreamedRow]] to return + * the matching row from the streamed input and may call [[getBufferedMatches]] to return the + * sequence of matching rows from the buffered input (in the case of an outer join, this will return + * an empty sequence if there are no matches from the buffered input). For efficiency, both of these + * methods return mutable objects which are re-used across calls to the `findNext*JoinRows()` + * methods. 
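A minimal, self-contained sketch of the calling protocol described in the paragraph above (hypothetical `ScannerLike` trait and `drainInnerJoin` helper used only for illustration; they mirror the contract of `SortMergeJoinScanner`, not Spark's actual classes):

    // Hypothetical, simplified stand-in for the scanner contract described above.
    trait ScannerLike[Row] {
      def findNextInnerJoinRows(): Boolean   // advances to the next group of matches
      def getStreamedRow: Row                // valid only after a successful call above
      def getBufferedMatches: Seq[Row]       // reused buffer; copy anything you keep
    }

    def drainInnerJoin[Row](scanner: ScannerLike[Row]): Seq[(Row, Row)] = {
      val out = Seq.newBuilder[(Row, Row)]
      while (scanner.findNextInnerJoinRows()) {
        val streamed = scanner.getStreamedRow
        // The buffer is mutable and reused, so materialize the pairs we emit.
        scanner.getBufferedMatches.foreach(buffered => out += ((streamed, buffered)))
      }
      out.result()
    }

The important point is that the scanner owns and reuses the returned objects, so a caller that retains rows beyond one iteration must copy them first.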
+ * + * @param streamedKeyGenerator a projection that produces join keys from the streamed input. + * @param bufferedKeyGenerator a projection that produces join keys from the buffered input. + * @param keyOrdering an ordering which can be used to compare join keys. + * @param streamedIter an input whose rows will be streamed. + * @param bufferedIter an input whose rows will be buffered to construct sequences of rows that + * have the same join key. + */ +private[joins] class SortMergeJoinScanner( + streamedKeyGenerator: Projection, + bufferedKeyGenerator: Projection, + keyOrdering: Ordering[InternalRow], + streamedIter: RowIterator, + numStreamedRows: LongSQLMetric, + bufferedIter: RowIterator, + numBufferedRows: LongSQLMetric) { + private[this] var streamedRow: InternalRow = _ + private[this] var streamedRowKey: InternalRow = _ + private[this] var bufferedRow: InternalRow = _ + // Note: this is guaranteed to never have any null columns: + private[this] var bufferedRowKey: InternalRow = _ + /** + * The join key for the rows buffered in `bufferedMatches`, or null if `bufferedMatches` is empty + */ + private[this] var matchJoinKey: InternalRow = _ + /** Buffered rows from the buffered side of the join. This is empty if there are no matches. */ + private[this] val bufferedMatches: ArrayBuffer[InternalRow] = new ArrayBuffer[InternalRow] + + // Initialization (note: do _not_ want to advance streamed here). + advancedBufferedToRowWithNullFreeJoinKey() + + // --- Public methods --------------------------------------------------------------------------- + + def getStreamedRow: InternalRow = streamedRow + + def getBufferedMatches: ArrayBuffer[InternalRow] = bufferedMatches + + /** + * Advances both input iterators, stopping when we have found rows with matching join keys. + * @return true if matching rows have been found and false otherwise. If this returns true, then + * [[getStreamedRow]] and [[getBufferedMatches]] can be called to construct the join + * results. + */ + final def findNextInnerJoinRows(): Boolean = { + while (advancedStreamed() && streamedRowKey.anyNull) { + // Advance the streamed side of the join until we find the next row whose join key contains + // no nulls or we hit the end of the streamed iterator. + } + if (streamedRow == null) { + // We have consumed the entire streamed iterator, so there can be no more matches. + matchJoinKey = null + bufferedMatches.clear() + false + } else if (matchJoinKey != null && keyOrdering.compare(streamedRowKey, matchJoinKey) == 0) { + // The new streamed row has the same join key as the previous row, so return the same matches. + true + } else if (bufferedRow == null) { + // The streamed row's join key does not match the current batch of buffered rows and there are + // no more rows to read from the buffered iterator, so there can be no more matches. + matchJoinKey = null + bufferedMatches.clear() + false + } else { + // Advance both the streamed and buffered iterators to find the next pair of matching rows. + var comp = keyOrdering.compare(streamedRowKey, bufferedRowKey) + do { + if (streamedRowKey.anyNull) { + advancedStreamed() + } else { + assert(!bufferedRowKey.anyNull) + comp = keyOrdering.compare(streamedRowKey, bufferedRowKey) + if (comp > 0) advancedBufferedToRowWithNullFreeJoinKey() + else if (comp < 0) advancedStreamed() } + } while (streamedRow != null && bufferedRow != null && comp != 0) + if (streamedRow == null || bufferedRow == null) { + // We have either hit the end of one of the iterators, so there can be no more matches. 
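The comparison loop above is the heart of the merge step. A toy version over plain key-sorted lists (illustrative only; keys are Ints and null keys are ignored) shows the same idea: advance whichever side has the smaller key, and once the keys compare equal, collect every buffered row with that key.

    // Toy merge step over two key-sorted sequences of (key, value) pairs.
    def firstMatchingGroup(
        streamed: List[(Int, String)],
        buffered: List[(Int, String)]): Option[(Int, (String, List[String]))] = {
      var s = streamed
      var b = buffered
      while (s.nonEmpty && b.nonEmpty && s.head._1 != b.head._1) {
        if (s.head._1 < b.head._1) s = s.tail else b = b.tail
      }
      if (s.isEmpty || b.isEmpty) None
      else {
        val key = s.head._1
        Some(key -> (s.head._2, b.takeWhile(_._1 == key).map(_._2)))
      }
    }

    // firstMatchingGroup(List(1 -> "a", 3 -> "b"), List(2 -> "x", 3 -> "y", 3 -> "z"))
    // => Some((3, ("b", List("y", "z"))))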
+ matchJoinKey = null + bufferedMatches.clear() + false + } else { + // The streamed row's join key matches the current buffered row's join, so walk through the + // buffered iterator to buffer the rest of the matching rows. + assert(comp == 0) + bufferMatchingRows() + true + } + } + } - /** - * Searches the right iterator for the next rows that have matches in left side, and store - * them in a buffer. - * - * @return true if the search is successful, and false if the right iterator runs out of - * tuples. - */ - private def nextMatchingPair(): Boolean = { - if (!stop && rightElement != null) { - // run both side to get the first match pair - while (!stop && leftElement != null && rightElement != null) { - val comparing = keyOrdering.compare(leftKey, rightKey) - // for inner join, we need to filter those null keys - stop = comparing == 0 && !leftKey.anyNull - if (comparing > 0 || rightKey.anyNull) { - fetchRight() - } else if (comparing < 0 || leftKey.anyNull) { - fetchLeft() - } - } - rightMatches = new CompactBuffer[InternalRow]() - if (stop) { - stop = false - // iterate the right side to buffer all rows that matches - // as the records should be ordered, exit when we meet the first that not match - while (!stop && rightElement != null) { - rightMatches += rightElement - fetchRight() - stop = keyOrdering.compare(leftKey, rightKey) != 0 - } - if (rightMatches.size > 0) { - rightPosition = 0 - matchKey = leftKey - } - } + /** + * Advances the streamed input iterator and buffers all rows from the buffered input that + * have matching keys. + * @return true if the streamed iterator returned a row, false otherwise. If this returns true, + * then [getStreamedRow and [[getBufferedMatches]] can be called to produce the outer + * join results. + */ + final def findNextOuterJoinRows(): Boolean = { + if (!advancedStreamed()) { + // We have consumed the entire streamed iterator, so there can be no more matches. + matchJoinKey = null + bufferedMatches.clear() + false + } else { + if (matchJoinKey != null && keyOrdering.compare(streamedRowKey, matchJoinKey) == 0) { + // Matches the current group, so do nothing. + } else { + // The streamed row does not match the current group. + matchJoinKey = null + bufferedMatches.clear() + if (bufferedRow != null && !streamedRowKey.anyNull) { + // The buffered iterator could still contain matching rows, so we'll need to walk through + // it until we either find matches or pass where they would be found. + var comp = 1 + do { + comp = keyOrdering.compare(streamedRowKey, bufferedRowKey) + } while (comp > 0 && advancedBufferedToRowWithNullFreeJoinKey()) + if (comp == 0) { + // We have found matches, so buffer them (this updates matchJoinKey) + bufferMatchingRows() + } else { + // We have overshot the position where the row would be found, hence no matches. } - rightMatches != null && rightMatches.size > 0 } } + // If there is a streamed input then we always return true + true } } + + // --- Private methods -------------------------------------------------------------------------- + + /** + * Advance the streamed iterator and compute the new row's join key. + * @return true if the streamed iterator returned a row and false otherwise. 
+ */ + private def advancedStreamed(): Boolean = { + if (streamedIter.advanceNext()) { + streamedRow = streamedIter.getRow + streamedRowKey = streamedKeyGenerator(streamedRow) + numStreamedRows += 1 + true + } else { + streamedRow = null + streamedRowKey = null + false + } + } + + /** + * Advance the buffered iterator until we find a row with join key that does not contain nulls. + * @return true if the buffered iterator returned a row and false otherwise. + */ + private def advancedBufferedToRowWithNullFreeJoinKey(): Boolean = { + var foundRow: Boolean = false + while (!foundRow && bufferedIter.advanceNext()) { + bufferedRow = bufferedIter.getRow + bufferedRowKey = bufferedKeyGenerator(bufferedRow) + numBufferedRows += 1 + foundRow = !bufferedRowKey.anyNull + } + if (!foundRow) { + bufferedRow = null + bufferedRowKey = null + false + } else { + true + } + } + + /** + * Called when the streamed and buffered join keys match in order to buffer the matching rows. + */ + private def bufferMatchingRows(): Unit = { + assert(streamedRowKey != null) + assert(!streamedRowKey.anyNull) + assert(bufferedRowKey != null) + assert(!bufferedRowKey.anyNull) + assert(keyOrdering.compare(streamedRowKey, bufferedRowKey) == 0) + // This join key may have been produced by a mutable projection, so we need to make a copy: + matchJoinKey = streamedRowKey.copy() + bufferedMatches.clear() + do { + bufferedMatches += bufferedRow.copy() // need to copy mutable rows before buffering them + advancedBufferedToRowWithNullFreeJoinKey() + } while (bufferedRow != null && keyOrdering.compare(streamedRowKey, bufferedRowKey) == 0) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala new file mode 100644 index 000000000000..dea9e5e580a1 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.joins + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter} +import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.{BinaryNode, RowIterator, SparkPlan} +import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetrics} + +/** + * :: DeveloperApi :: + * Performs an sort merge outer join of two child relations. + * + * Note: this does not support full outer join yet; see SPARK-9730 for progress on this. 
+ */ +@DeveloperApi +case class SortMergeOuterJoin( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + joinType: JoinType, + condition: Option[Expression], + left: SparkPlan, + right: SparkPlan) extends BinaryNode { + + override private[sql] lazy val metrics = Map( + "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), + "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + + override def output: Seq[Attribute] = { + joinType match { + case LeftOuter => + left.output ++ right.output.map(_.withNullability(true)) + case RightOuter => + left.output.map(_.withNullability(true)) ++ right.output + case x => + throw new IllegalArgumentException( + s"${getClass.getSimpleName} should not take $x as the JoinType") + } + } + + override def outputPartitioning: Partitioning = joinType match { + // For left and right outer joins, the output is partitioned by the streamed input's join keys. + case LeftOuter => left.outputPartitioning + case RightOuter => right.outputPartitioning + case x => + throw new IllegalArgumentException( + s"${getClass.getSimpleName} should not take $x as the JoinType") + } + + override def outputOrdering: Seq[SortOrder] = joinType match { + // For left and right outer joins, the output is ordered by the streamed input's join keys. + case LeftOuter => requiredOrders(leftKeys) + case RightOuter => requiredOrders(rightKeys) + case x => throw new IllegalArgumentException( + s"SortMergeOuterJoin should not take $x as the JoinType") + } + + override def requiredChildDistribution: Seq[Distribution] = + ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + + override def requiredChildOrdering: Seq[Seq[SortOrder]] = + requiredOrders(leftKeys) :: requiredOrders(rightKeys) :: Nil + + private def requiredOrders(keys: Seq[Expression]): Seq[SortOrder] = { + // This must be ascending in order to agree with the `keyOrdering` defined in `doExecute()`. + keys.map(SortOrder(_, Ascending)) + } + + private def isUnsafeMode: Boolean = { + (codegenEnabled && unsafeEnabled + && UnsafeProjection.canSupport(leftKeys) + && UnsafeProjection.canSupport(rightKeys) + && UnsafeProjection.canSupport(schema)) + } + + override def outputsUnsafeRows: Boolean = isUnsafeMode + override def canProcessUnsafeRows: Boolean = isUnsafeMode + override def canProcessSafeRows: Boolean = !isUnsafeMode + + private def createLeftKeyGenerator(): Projection = { + if (isUnsafeMode) { + UnsafeProjection.create(leftKeys, left.output) + } else { + newProjection(leftKeys, left.output) + } + } + + private def createRightKeyGenerator(): Projection = { + if (isUnsafeMode) { + UnsafeProjection.create(rightKeys, right.output) + } else { + newProjection(rightKeys, right.output) + } + } + + override def doExecute(): RDD[InternalRow] = { + val numLeftRows = longMetric("numLeftRows") + val numRightRows = longMetric("numRightRows") + val numOutputRows = longMetric("numOutputRows") + + left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => + // An ordering that can be used to compare keys from both sides. 
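As a rough sketch of the semantics the operator below implements, using ordinary Scala collections (a hypothetical `leftOuterJoin` helper; `Option` stands in for the null-padded buffered side and `condition` plays the role of the bound join condition):

    // Toy left outer join over in-memory rows keyed by Int.
    def leftOuterJoin[L, R](
        left: Seq[(Int, L)],
        right: Seq[(Int, R)],
        condition: (L, R) => Boolean): Seq[(L, Option[R])] = {
      val byKey = right.groupBy(_._1)
      left.flatMap { case (k, l) =>
        val matches = byKey.getOrElse(k, Seq.empty).collect { case (_, r) if condition(l, r) => r }
        if (matches.isEmpty) Seq((l, None))          // no match: pad the buffered side with "null"
        else matches.map(r => (l, Some(r)))
      }
    }

    // leftOuterJoin(Seq(1 -> "a", 2 -> "b"), Seq(1 -> 10), (_: String, _: Int) => true)
    // => Seq(("a", Some(10)), ("b", None))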
+ val keyOrdering = newNaturalAscendingOrdering(leftKeys.map(_.dataType)) + val boundCondition: (InternalRow) => Boolean = { + condition.map { cond => + newPredicate(cond, left.output ++ right.output) + }.getOrElse { + (r: InternalRow) => true + } + } + val resultProj: InternalRow => InternalRow = { + if (isUnsafeMode) { + UnsafeProjection.create(schema) + } else { + identity[InternalRow] + } + } + + joinType match { + case LeftOuter => + val smjScanner = new SortMergeJoinScanner( + streamedKeyGenerator = createLeftKeyGenerator(), + bufferedKeyGenerator = createRightKeyGenerator(), + keyOrdering, + streamedIter = RowIterator.fromScala(leftIter), + numLeftRows, + bufferedIter = RowIterator.fromScala(rightIter), + numRightRows + ) + val rightNullRow = new GenericInternalRow(right.output.length) + new LeftOuterIterator( + smjScanner, rightNullRow, boundCondition, resultProj, numOutputRows).toScala + + case RightOuter => + val smjScanner = new SortMergeJoinScanner( + streamedKeyGenerator = createRightKeyGenerator(), + bufferedKeyGenerator = createLeftKeyGenerator(), + keyOrdering, + streamedIter = RowIterator.fromScala(rightIter), + numRightRows, + bufferedIter = RowIterator.fromScala(leftIter), + numLeftRows + ) + val leftNullRow = new GenericInternalRow(left.output.length) + new RightOuterIterator( + smjScanner, leftNullRow, boundCondition, resultProj, numOutputRows).toScala + + case x => + throw new IllegalArgumentException( + s"SortMergeOuterJoin should not take $x as the JoinType") + } + } + } +} + + +private class LeftOuterIterator( + smjScanner: SortMergeJoinScanner, + rightNullRow: InternalRow, + boundCondition: InternalRow => Boolean, + resultProj: InternalRow => InternalRow, + numRows: LongSQLMetric + ) extends RowIterator { + private[this] val joinedRow: JoinedRow = new JoinedRow() + private[this] var rightIdx: Int = 0 + assert(smjScanner.getBufferedMatches.length == 0) + + private def advanceLeft(): Boolean = { + rightIdx = 0 + if (smjScanner.findNextOuterJoinRows()) { + joinedRow.withLeft(smjScanner.getStreamedRow) + if (smjScanner.getBufferedMatches.isEmpty) { + // There are no matching right rows, so return nulls for the right row + joinedRow.withRight(rightNullRow) + } else { + // Find the next row from the right input that satisfied the bound condition + if (!advanceRightUntilBoundConditionSatisfied()) { + joinedRow.withRight(rightNullRow) + } + } + true + } else { + // Left input has been exhausted + false + } + } + + private def advanceRightUntilBoundConditionSatisfied(): Boolean = { + var foundMatch: Boolean = false + while (!foundMatch && rightIdx < smjScanner.getBufferedMatches.length) { + foundMatch = boundCondition(joinedRow.withRight(smjScanner.getBufferedMatches(rightIdx))) + rightIdx += 1 + } + foundMatch + } + + override def advanceNext(): Boolean = { + val r = advanceRightUntilBoundConditionSatisfied() || advanceLeft() + if (r) numRows += 1 + r + } + + override def getRow: InternalRow = resultProj(joinedRow) +} + +private class RightOuterIterator( + smjScanner: SortMergeJoinScanner, + leftNullRow: InternalRow, + boundCondition: InternalRow => Boolean, + resultProj: InternalRow => InternalRow, + numRows: LongSQLMetric + ) extends RowIterator { + private[this] val joinedRow: JoinedRow = new JoinedRow() + private[this] var leftIdx: Int = 0 + assert(smjScanner.getBufferedMatches.length == 0) + + private def advanceRight(): Boolean = { + leftIdx = 0 + if (smjScanner.findNextOuterJoinRows()) { + joinedRow.withRight(smjScanner.getStreamedRow) + if 
(smjScanner.getBufferedMatches.isEmpty) { + // There are no matching left rows, so return nulls for the left row + joinedRow.withLeft(leftNullRow) + } else { + // Find the next row from the left input that satisfied the bound condition + if (!advanceLeftUntilBoundConditionSatisfied()) { + joinedRow.withLeft(leftNullRow) + } + } + true + } else { + // Right input has been exhausted + false + } + } + + private def advanceLeftUntilBoundConditionSatisfied(): Boolean = { + var foundMatch: Boolean = false + while (!foundMatch && leftIdx < smjScanner.getBufferedMatches.length) { + foundMatch = boundCondition(joinedRow.withLeft(smjScanner.getBufferedMatches(leftIdx))) + leftIdx += 1 + } + foundMatch + } + + override def advanceNext(): Boolean = { + val r = advanceLeftUntilBoundConditionSatisfied() || advanceRight() + if (r) numRows += 1 + r + } + + override def getRow: InternalRow = resultProj(joinedRow) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala new file mode 100644 index 000000000000..7a2a98ec18cb --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -0,0 +1,121 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.metric + +import org.apache.spark.{Accumulable, AccumulableParam, SparkContext} + +/** + * Create a layer for specialized metric. We cannot add `@specialized` to + * `Accumulable/AccumulableParam` because it will break Java source compatibility. + * + * An implementation of SQLMetric should override `+=` and `add` to avoid boxing. + */ +private[sql] abstract class SQLMetric[R <: SQLMetricValue[T], T]( + name: String, val param: SQLMetricParam[R, T]) + extends Accumulable[R, T](param.zero, param, Some(name), true) + +/** + * Create a layer for specialized metric. We cannot add `@specialized` to + * `Accumulable/AccumulableParam` because it will break Java source compatibility. + */ +private[sql] trait SQLMetricParam[R <: SQLMetricValue[T], T] extends AccumulableParam[R, T] { + + def zero: R +} + +/** + * Create a layer for specialized metric. We cannot add `@specialized` to + * `Accumulable/AccumulableParam` because it will break Java source compatibility. 
+ */ +private[sql] trait SQLMetricValue[T] extends Serializable { + + def value: T + + override def toString: String = value.toString +} + +/** + * A wrapper of Long to avoid boxing and unboxing when using Accumulator + */ +private[sql] class LongSQLMetricValue(private var _value : Long) extends SQLMetricValue[Long] { + + def add(incr: Long): LongSQLMetricValue = { + _value += incr + this + } + + // Although there is a boxing here, it's fine because it's only called in SQLListener + override def value: Long = _value +} + +/** + * A wrapper of Int to avoid boxing and unboxing when using Accumulator + */ +private[sql] class IntSQLMetricValue(private var _value: Int) extends SQLMetricValue[Int] { + + def add(term: Int): IntSQLMetricValue = { + _value += term + this + } + + // Although there is a boxing here, it's fine because it's only called in SQLListener + override def value: Int = _value +} + +/** + * A specialized long Accumulable to avoid boxing and unboxing when using Accumulator's + * `+=` and `add`. + */ +private[sql] class LongSQLMetric private[metric](name: String) + extends SQLMetric[LongSQLMetricValue, Long](name, LongSQLMetricParam) { + + override def +=(term: Long): Unit = { + localValue.add(term) + } + + override def add(term: Long): Unit = { + localValue.add(term) + } +} + +private object LongSQLMetricParam extends SQLMetricParam[LongSQLMetricValue, Long] { + + override def addAccumulator(r: LongSQLMetricValue, t: Long): LongSQLMetricValue = r.add(t) + + override def addInPlace(r1: LongSQLMetricValue, r2: LongSQLMetricValue): LongSQLMetricValue = + r1.add(r2.value) + + override def zero(initialValue: LongSQLMetricValue): LongSQLMetricValue = zero + + override def zero: LongSQLMetricValue = new LongSQLMetricValue(0L) +} + +private[sql] object SQLMetrics { + + def createLongMetric(sc: SparkContext, name: String): LongSQLMetric = { + val acc = new LongSQLMetric(name) + sc.cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc + } + + /** + * A metric that its value will be ignored. Use this one when we need a metric parameter but don't + * care about the value. + */ + val nullLongMetric = new LongSQLMetric("null") +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/package.scala index 66237f8f1314..28fa231e722d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/package.scala @@ -18,12 +18,6 @@ package org.apache.spark.sql /** - * :: DeveloperApi :: - * An execution engine for relational query plans that runs on top Spark and returns RDDs. - * - * Note that the operators in this package are created automatically by a query planner using a - * [[SQLContext]] and are not intended to be used directly by end users of Spark SQL. They are - * documented here in order to make it easier for others to understand the performance - * characteristics of query plans that are generated by Spark SQL. + * The physical execution component of Spark SQL. Note that this is a private package. 
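A self-contained mimic (not Spark's private[sql] classes) of the idea behind `LongSQLMetricValue`: the accumulator's value type is a small mutable holder whose `add` mutates in place, so per-row updates avoid allocating boxed longs.

    // Minimal sketch of the "mutable wrapper to avoid boxing" design.
    final class LongMetricValue(private var _value: Long) {
      def add(incr: Long): LongMetricValue = { _value += incr; this }
      def value: Long = _value
      override def toString: String = _value.toString
    }

    val numOutputRows = new LongMetricValue(0L)
    (1 to 5).foreach(_ => numOutputRows.add(1L))
    assert(numOutputRows.value == 5L)

In the patch itself, operators obtain such metrics with `longMetric("numOutputRows")` and bump them with `+=` inside `doExecute()`, as seen throughout the join operators above.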
*/ package object execution diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala index aade2e769ccd..59f8b079ab33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala @@ -21,7 +21,6 @@ import java.io.OutputStream import java.util.{List => JList, Map => JMap} import scala.collection.JavaConversions._ -import scala.collection.JavaConverters._ import net.razorvine.pickle._ @@ -66,7 +65,7 @@ private[spark] case class PythonUDF( * multiple child operators. */ private[spark] object ExtractPythonUDFs extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { // Skip EvaluatePython nodes. case plan: EvaluatePython => plan @@ -182,7 +181,7 @@ object EvaluatePython { case (c: Double, DoubleType) => c - case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c) + case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c, dt.precision, dt.scale) case (c: Int, DateType) => c diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala index 29f3beb3cb3c..855555dd1d4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala @@ -21,6 +21,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule /** @@ -33,6 +34,8 @@ case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode { require(UnsafeProjection.canSupport(child.schema), s"Cannot convert ${child.schema} to Unsafe") override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = false override def canProcessSafeRows: Boolean = true @@ -51,6 +54,8 @@ case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode { @DeveloperApi case class ConvertToSafe(child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputsUnsafeRows: Boolean = false override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala index 92cf328c76cb..40ef7c3b5353 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala @@ -17,14 +17,15 @@ package org.apache.spark.sql.execution -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{MapPartitionsWithPreparationRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import 
org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, OrderedDistribution, Distribution} +import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} import org.apache.spark.sql.types.StructType import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter +import org.apache.spark.{SparkEnv, InternalAccumulator, TaskContext} //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines various sort operators. @@ -76,6 +77,11 @@ case class ExternalSort( val sorter = new ExternalSorter[InternalRow, Null, InternalRow](ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) + val context = TaskContext.get() + context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) + context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) // TODO(marmbrus): The complex type signature below thwarts inference for no reason. CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) @@ -116,8 +122,12 @@ case class TungstenSort( protected override def doExecute(): RDD[InternalRow] = { val schema = child.schema val childOutput = child.output - val pageSize = sparkContext.conf.getSizeAsBytes("spark.buffer.pageSize", "64m") - child.execute().mapPartitions({ iter => + + /** + * Set up the sorter in each partition before computing the parent partition. + * This makes sure our sorter is not starved by other sorters used in the same task. + */ + def preparePartition(): UnsafeExternalRowSorter = { val ordering = newOrdering(sortOrder, childOutput) // The comparator for comparing prefix @@ -132,13 +142,31 @@ case class TungstenSort( } } + val pageSize = SparkEnv.get.shuffleMemoryManager.pageSizeBytes val sorter = new UnsafeExternalRowSorter( schema, ordering, prefixComparator, prefixComputer, pageSize) if (testSpillFrequency > 0) { sorter.setTestSpillFrequency(testSpillFrequency) } - sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]]) - }, preservesPartitioning = true) + sorter + } + + /** Compute a partition using the sorter already set up previously. */ + def executePartition( + taskContext: TaskContext, + partitionIndex: Int, + sorter: UnsafeExternalRowSorter, + parentIterator: Iterator[InternalRow]): Iterator[InternalRow] = { + val sortedIterator = sorter.sort(parentIterator.asInstanceOf[Iterator[UnsafeRow]]) + taskContext.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.getPeakMemoryUsage) + sortedIterator + } + + // Note: we need to set up the external sorter in each partition before computing + // the parent partition, so we cannot simply use `mapPartitions` here (SPARK-9709). 
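The comment above, together with the `MapPartitionsWithPreparationRDD` call that follows, splits partition processing into a preparation step and an execution step so the sorter can reserve memory before the parent iterator is consumed. A minimal stand-alone sketch of that pattern, assuming nothing about Spark's internal RDD class:

    // Toy version of the prepare/execute split: preparation runs before the upstream
    // iterator is touched, so resources are acquired up front rather than lazily.
    def mapPartitionsWithPreparation[T, U, P](
        partition: Iterator[T])(
        prepare: () => P)(
        execute: (P, Iterator[T]) => Iterator[U]): Iterator[U] = {
      val prepared = prepare()        // e.g. allocate a sorter / reserve memory
      execute(prepared, partition)    // only now do we consume the parent iterator
    }

    // Example: "prepare" a growable buffer, then sort the partition with it.
    val sorted = mapPartitionsWithPreparation(Iterator(3, 1, 2))(
      () => collection.mutable.ArrayBuffer.empty[Int])(
      (buf, it) => { it.foreach(buf += _); buf.sorted.iterator })
    // sorted.toList == List(1, 2, 3)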
+ new MapPartitionsWithPreparationRDD[InternalRow, InternalRow, UnsafeExternalRowSorter]( + child.execute(), preparePartition, executePartition, preservesPartitioning = true) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala index 9329148aa233..db463029aedf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala @@ -20,17 +20,15 @@ package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.{Row, Column, DataFrame} private[sql] object FrequentItems extends Logging { /** A helper class wrapping `MutableMap[Any, Long]` for simplicity. */ private class FreqItemCounter(size: Int) extends Serializable { val baseMap: MutableMap[Any, Long] = MutableMap.empty[Any, Long] - /** * Add a new example to the counts if it exists, otherwise deduct the count * from existing items. @@ -42,9 +40,15 @@ private[sql] object FrequentItems extends Logging { if (baseMap.size < size) { baseMap += key -> count } else { - // TODO: Make this more efficient... A flatMap? - baseMap.retain((k, v) => v > count) - baseMap.transform((k, v) => v - count) + val minCount = baseMap.values.min + val remainder = count - minCount + if (remainder >= 0) { + baseMap += key -> count // something will get kicked out, so we can add this + baseMap.retain((k, v) => v > minCount) + baseMap.transform((k, v) => v - minCount) + } else { + baseMap.transform((k, v) => v - count) + } } } this @@ -90,12 +94,12 @@ private[sql] object FrequentItems extends Logging { (name, originalSchema.fields(index).dataType) }.toArray - val freqItems = df.select(cols.map(Column(_)) : _*).queryExecution.toRdd.aggregate(countMaps)( + val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) - val key = row.get(i, colInfo(i)._2) + val key = row.get(i) thisMap.add(key, 1L) i += 1 } @@ -110,13 +114,13 @@ private[sql] object FrequentItems extends Logging { baseCounts } ) - val justItems = freqItems.map(m => m.baseMap.keys.toArray).map(new GenericArrayData(_)) - val resultRow = InternalRow(justItems : _*) + val justItems = freqItems.map(m => m.baseMap.keys.toArray) + val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes - new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow))) + new DataFrame(df.sqlContext, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala new file mode 100644 index 000000000000..49646a99d68c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
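Returning to the `FrequentItems` change earlier in this hunk: the new `add` logic is a decrement-style update in the spirit of Misra-Gries counting. A stand-alone re-implementation with a worked example follows (the branch for an already-tracked key is assumed from the class comment, since it is not shown in this hunk):

    import scala.collection.mutable.{Map => MutableMap}

    // Illustrative re-implementation of the counter update shown in the patch.
    def add(baseMap: MutableMap[Any, Long], size: Int, key: Any, count: Long): Unit = {
      if (baseMap.contains(key)) {
        baseMap(key) += count                        // assumed branch for an existing key
      } else if (baseMap.size < size) {
        baseMap += key -> count
      } else {
        val minCount = baseMap.values.min
        if (count - minCount >= 0) {
          baseMap += key -> count                    // something will get kicked out
          baseMap.retain((_, v) => v > minCount)
          baseMap.transform((_, v) => v - minCount)
        } else {
          baseMap.transform((_, v) => v - count)
        }
      }
    }

    val m = MutableMap[Any, Long]("a" -> 5L, "b" -> 2L)
    add(m, size = 2, key = "c", count = 3L)
    // m == Map("a" -> 3, "c" -> 1): "b" had the minimum count and was evicted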
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import javax.servlet.http.HttpServletRequest + +import scala.collection.mutable +import scala.xml.Node + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.Logging +import org.apache.spark.ui.{UIUtils, WebUIPage} + +private[ui] class AllExecutionsPage(parent: SQLTab) extends WebUIPage("") with Logging { + + private val listener = parent.listener + + override def render(request: HttpServletRequest): Seq[Node] = { + val currentTime = System.currentTimeMillis() + val content = listener.synchronized { + val _content = mutable.ListBuffer[Node]() + if (listener.getRunningExecutions.nonEmpty) { + _content ++= + new RunningExecutionTable( + parent, "Running Queries", currentTime, + listener.getRunningExecutions.sortBy(_.submissionTime).reverse).toNodeSeq + } + if (listener.getCompletedExecutions.nonEmpty) { + _content ++= + new CompletedExecutionTable( + parent, "Completed Queries", currentTime, + listener.getCompletedExecutions.sortBy(_.submissionTime).reverse).toNodeSeq + } + if (listener.getFailedExecutions.nonEmpty) { + _content ++= + new FailedExecutionTable( + parent, "Failed Queries", currentTime, + listener.getFailedExecutions.sortBy(_.submissionTime).reverse).toNodeSeq + } + _content + } + UIUtils.headerSparkPage("SQL", content, parent, Some(5000)) + } +} + +private[ui] abstract class ExecutionTable( + parent: SQLTab, + tableId: String, + tableName: String, + currentTime: Long, + executionUIDatas: Seq[SQLExecutionUIData], + showRunningJobs: Boolean, + showSucceededJobs: Boolean, + showFailedJobs: Boolean) { + + protected def baseHeader: Seq[String] = Seq( + "ID", + "Description", + "Submitted", + "Duration") + + protected def header: Seq[String] + + protected def row(currentTime: Long, executionUIData: SQLExecutionUIData): Seq[Node] = { + val submissionTime = executionUIData.submissionTime + val duration = executionUIData.completionTime.getOrElse(currentTime) - submissionTime + + val runningJobs = executionUIData.runningJobs.map { jobId => + {jobId.toString}
+    }
+    val succeededJobs = executionUIData.succeededJobs.sorted.map { jobId =>
+      <a href={jobURL(jobId)}>{jobId.toString}</a> }
+    val failedJobs = executionUIData.failedJobs.sorted.map { jobId =>
+      <a href={jobURL(jobId)}>{jobId.toString}</a> }
+

+ + + + + {if (showRunningJobs) { + + }} + {if (showSucceededJobs) { + + }} + {if (showFailedJobs) { + + }} + {detailCell(executionUIData.physicalPlanDescription)} + + } + + private def descriptionCell(execution: SQLExecutionUIData): Seq[Node] = { + val details = if (execution.details.nonEmpty) { + + +details + ++ + + } else { + Nil + } + + val desc = { + {execution.description} + } + +
+    <div>{desc} {details}</div>
+ } + + private def detailCell(physicalPlan: String): Seq[Node] = { + val isMultiline = physicalPlan.indexOf('\n') >= 0 + val summary = StringEscapeUtils.escapeHtml4( + if (isMultiline) { + physicalPlan.substring(0, physicalPlan.indexOf('\n')) + } else { + physicalPlan + }) + val details = if (isMultiline) { + // scalastyle:off + + +details + ++ + + // scalastyle:on + } else { + "" + } + + } + + def toNodeSeq: Seq[Node] = { +
+    <div>
+      <h4>{tableName}</h4>
+      {UIUtils.listingTable[SQLExecutionUIData](
+        header, row(currentTime, _), executionUIDatas, id = Some(tableId))}
+    </div>
+ } + + private def jobURL(jobId: Long): String = + "%s/jobs/job?id=%s".format(UIUtils.prependBaseUri(parent.basePath), jobId) + + private def executionURL(executionID: Long): String = + s"${UIUtils.prependBaseUri(parent.basePath)}/${parent.prefix}/execution?id=$executionID" +} + +private[ui] class RunningExecutionTable( + parent: SQLTab, + tableName: String, + currentTime: Long, + executionUIDatas: Seq[SQLExecutionUIData]) + extends ExecutionTable( + parent, + "running-execution-table", + tableName, + currentTime, + executionUIDatas, + showRunningJobs = true, + showSucceededJobs = true, + showFailedJobs = true) { + + override protected def header: Seq[String] = + baseHeader ++ Seq("Running Jobs", "Succeeded Jobs", "Failed Jobs", "Detail") +} + +private[ui] class CompletedExecutionTable( + parent: SQLTab, + tableName: String, + currentTime: Long, + executionUIDatas: Seq[SQLExecutionUIData]) + extends ExecutionTable( + parent, + "completed-execution-table", + tableName, + currentTime, + executionUIDatas, + showRunningJobs = false, + showSucceededJobs = true, + showFailedJobs = false) { + + override protected def header: Seq[String] = baseHeader ++ Seq("Jobs", "Detail") +} + +private[ui] class FailedExecutionTable( + parent: SQLTab, + tableName: String, + currentTime: Long, + executionUIDatas: Seq[SQLExecutionUIData]) + extends ExecutionTable( + parent, + "failed-execution-table", + tableName, + currentTime, + executionUIDatas, + showRunningJobs = false, + showSucceededJobs = true, + showFailedJobs = true) { + + override protected def header: Seq[String] = + baseHeader ++ Seq("Succeeded Jobs", "Failed Jobs", "Detail") +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala new file mode 100644 index 000000000000..a4dbd2e1978d --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.ui + +import javax.servlet.http.HttpServletRequest + +import scala.xml.{Node, Unparsed} + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.Logging +import org.apache.spark.ui.{UIUtils, WebUIPage} + +private[sql] class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging { + + private val listener = parent.listener + + override def render(request: HttpServletRequest): Seq[Node] = listener.synchronized { + val parameterExecutionId = request.getParameter("id") + require(parameterExecutionId != null && parameterExecutionId.nonEmpty, + "Missing execution id parameter") + + val executionId = parameterExecutionId.toLong + val content = listener.getExecution(executionId).map { executionUIData => + val currentTime = System.currentTimeMillis() + val duration = + executionUIData.completionTime.getOrElse(currentTime) - executionUIData.submissionTime + + val summary = +
+      <div>
+        <ul>
+          <li>Submitted Time: {UIUtils.formatDate(executionUIData.submissionTime)}</li>
+          <li>Duration: {UIUtils.formatDuration(duration)}</li>
+          {if (executionUIData.runningJobs.nonEmpty) {
+            <li>Running Jobs:
+              {executionUIData.runningJobs.sorted.map { jobId =>
+                <a href={jobURL(jobId)}>{jobId.toString}</a> }}
+            </li>
+          }}
+          {if (executionUIData.succeededJobs.nonEmpty) {
+            <li>Succeeded Jobs:
+              {executionUIData.succeededJobs.sorted.map { jobId =>
+                <a href={jobURL(jobId)}>{jobId.toString}</a> }}
+            </li>
+          }}
+          {if (executionUIData.failedJobs.nonEmpty) {
+            <li>Failed Jobs:
+              {executionUIData.failedJobs.sorted.map { jobId =>
+                <a href={jobURL(jobId)}>{jobId.toString}</a> }}
+            </li>
+          }}
+        </ul>
+      </div>
+ + val metrics = listener.getExecutionMetrics(executionId) + + summary ++ + planVisualization(metrics, executionUIData.physicalPlanGraph) ++ + physicalPlanDescription(executionUIData.physicalPlanDescription) + }.getOrElse { +
+      <div>No information to display for Plan {executionId}</div>
+ } + + UIUtils.headerSparkPage(s"Details for Query $executionId", content, parent, Some(5000)) + } + + + private def planVisualizationResources: Seq[Node] = { + // scalastyle:off + + + + + + // scalastyle:on + } + + private def planVisualization(metrics: Map[Long, Any], graph: SparkPlanGraph): Seq[Node] = { + val metadata = graph.nodes.flatMap { node => + val nodeId = s"plan-meta-data-${node.id}" +
{node.desc}
+ } + +
+
+ + {planVisualizationResources} + +
+ } + + private def jobURL(jobId: Long): String = + "%s/jobs/job?id=%s".format(UIUtils.prependBaseUri(parent.basePath), jobId) + + private def physicalPlanDescription(physicalPlanDescription: String): Seq[Node] = { +
+ + + Details + +
+ + +
+ } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala new file mode 100644 index 000000000000..5779c71f64e9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import scala.collection.mutable + +import com.google.common.annotations.VisibleForTesting + +import org.apache.spark.{JobExecutionStatus, Logging} +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.metric.{SQLMetricParam, SQLMetricValue} + +private[sql] class SQLListener(sqlContext: SQLContext) extends SparkListener with Logging { + + private val retainedExecutions = + sqlContext.sparkContext.conf.getInt("spark.sql.ui.retainedExecutions", 1000) + + private val activeExecutions = mutable.HashMap[Long, SQLExecutionUIData]() + + // Old data in the following fields must be removed in "trimExecutionsIfNecessary". + // If adding new fields, make sure "trimExecutionsIfNecessary" can clean up old data + private val _executionIdToData = mutable.HashMap[Long, SQLExecutionUIData]() + + /** + * Maintain the relation between job id and execution id so that we can get the execution id in + * the "onJobEnd" method. 
+ */ + private val _jobIdToExecutionId = mutable.HashMap[Long, Long]() + + private val _stageIdToStageMetrics = mutable.HashMap[Long, SQLStageMetrics]() + + private val failedExecutions = mutable.ListBuffer[SQLExecutionUIData]() + + private val completedExecutions = mutable.ListBuffer[SQLExecutionUIData]() + + def executionIdToData: Map[Long, SQLExecutionUIData] = synchronized { + _executionIdToData.toMap + } + + def jobIdToExecutionId: Map[Long, Long] = synchronized { + _jobIdToExecutionId.toMap + } + + def stageIdToStageMetrics: Map[Long, SQLStageMetrics] = synchronized { + _stageIdToStageMetrics.toMap + } + + private def trimExecutionsIfNecessary( + executions: mutable.ListBuffer[SQLExecutionUIData]): Unit = { + if (executions.size > retainedExecutions) { + val toRemove = math.max(retainedExecutions / 10, 1) + executions.take(toRemove).foreach { execution => + for (executionUIData <- _executionIdToData.remove(execution.executionId)) { + for (jobId <- executionUIData.jobs.keys) { + _jobIdToExecutionId.remove(jobId) + } + for (stageId <- executionUIData.stages) { + _stageIdToStageMetrics.remove(stageId) + } + } + } + executions.trimStart(toRemove) + } + } + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + val executionIdString = jobStart.properties.getProperty(SQLExecution.EXECUTION_ID_KEY) + if (executionIdString == null) { + // This is not a job created by SQL + return + } + val executionId = executionIdString.toLong + val jobId = jobStart.jobId + val stageIds = jobStart.stageIds + + synchronized { + activeExecutions.get(executionId).foreach { executionUIData => + executionUIData.jobs(jobId) = JobExecutionStatus.RUNNING + executionUIData.stages ++= stageIds + stageIds.foreach(stageId => + _stageIdToStageMetrics(stageId) = new SQLStageMetrics(stageAttemptId = 0)) + _jobIdToExecutionId(jobId) = executionId + } + } + } + + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = synchronized { + val jobId = jobEnd.jobId + for (executionId <- _jobIdToExecutionId.get(jobId); + executionUIData <- _executionIdToData.get(executionId)) { + jobEnd.jobResult match { + case JobSucceeded => executionUIData.jobs(jobId) = JobExecutionStatus.SUCCEEDED + case JobFailed(_) => executionUIData.jobs(jobId) = JobExecutionStatus.FAILED + } + if (executionUIData.completionTime.nonEmpty && !executionUIData.hasRunningJobs) { + // We are the last job of this execution, so mark the execution as finished. Note that + // `onExecutionEnd` also does this, but currently that can be called before `onJobEnd` + // since these are called on different threads. 
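The trimming above keeps the listener's bookkeeping bounded: once a list grows past `spark.sql.ui.retainedExecutions`, roughly the oldest 10% are dropped in one pass. A tiny stand-alone illustration of that arithmetic (not the listener itself, which also unregisters per-job and per-stage state):

    import scala.collection.mutable.ListBuffer

    // Illustrative trim: drop the oldest ~10% once the retention limit is exceeded.
    def trimIfNecessary[T](executions: ListBuffer[T], retained: Int): Unit = {
      if (executions.size > retained) {
        val toRemove = math.max(retained / 10, 1)
        executions.trimStart(toRemove)
      }
    }

    val buf = ListBuffer.range(0, 1005)     // 1005 completed executions, limit 1000
    trimIfNecessary(buf, retained = 1000)   // drops the 100 oldest, leaving 905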
+ markExecutionFinished(executionId) + } + } + } + + override def onExecutorMetricsUpdate( + executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = synchronized { + for ((taskId, stageId, stageAttemptID, metrics) <- executorMetricsUpdate.taskMetrics) { + updateTaskAccumulatorValues(taskId, stageId, stageAttemptID, metrics, finishTask = false) + } + } + + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { + val stageId = stageSubmitted.stageInfo.stageId + val stageAttemptId = stageSubmitted.stageInfo.attemptId + // Always override metrics for old stage attempt + _stageIdToStageMetrics(stageId) = new SQLStageMetrics(stageAttemptId) + } + + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { + updateTaskAccumulatorValues( + taskEnd.taskInfo.taskId, + taskEnd.stageId, + taskEnd.stageAttemptId, + taskEnd.taskMetrics, + finishTask = true) + } + + /** + * Update the accumulator values of a task with the latest metrics for this task. This is called + * every time we receive an executor heartbeat or when a task finishes. + */ + private def updateTaskAccumulatorValues( + taskId: Long, + stageId: Int, + stageAttemptID: Int, + metrics: TaskMetrics, + finishTask: Boolean): Unit = { + if (metrics == null) { + return + } + + _stageIdToStageMetrics.get(stageId) match { + case Some(stageMetrics) => + if (stageAttemptID < stageMetrics.stageAttemptId) { + // A task of an old stage attempt. Because a new stage is submitted, we can ignore it. + } else if (stageAttemptID > stageMetrics.stageAttemptId) { + logWarning(s"A task should not have a higher stageAttemptID ($stageAttemptID) then " + + s"what we have seen (${stageMetrics.stageAttemptId})") + } else { + // TODO We don't know the attemptId. Currently, what we can do is overriding the + // accumulator updates. However, if there are two same task are running, such as + // speculation, the accumulator updates will be overriding by different task attempts, + // the results will be weird. + stageMetrics.taskIdToMetricUpdates.get(taskId) match { + case Some(taskMetrics) => + if (finishTask) { + taskMetrics.finished = true + taskMetrics.accumulatorUpdates = metrics.accumulatorUpdates() + } else if (!taskMetrics.finished) { + taskMetrics.accumulatorUpdates = metrics.accumulatorUpdates() + } else { + // If a task is finished, we should not override with accumulator updates from + // heartbeat reports + } + case None => + // TODO Now just set attemptId to 0. 
Should fix here when we can get the attempt + // id from SparkListenerExecutorMetricsUpdate + stageMetrics.taskIdToMetricUpdates(taskId) = new SQLTaskMetrics( + attemptId = 0, finished = finishTask, metrics.accumulatorUpdates()) + } + } + case None => + // This execution and its stage have been dropped + } + } + + def onExecutionStart( + executionId: Long, + description: String, + details: String, + physicalPlanDescription: String, + physicalPlanGraph: SparkPlanGraph, + time: Long): Unit = { + val sqlPlanMetrics = physicalPlanGraph.nodes.flatMap { node => + node.metrics.map(metric => metric.accumulatorId -> metric) + } + + val executionUIData = new SQLExecutionUIData(executionId, description, details, + physicalPlanDescription, physicalPlanGraph, sqlPlanMetrics.toMap, time) + synchronized { + activeExecutions(executionId) = executionUIData + _executionIdToData(executionId) = executionUIData + } + } + + def onExecutionEnd(executionId: Long, time: Long): Unit = synchronized { + _executionIdToData.get(executionId).foreach { executionUIData => + executionUIData.completionTime = Some(time) + if (!executionUIData.hasRunningJobs) { + // onExecutionEnd happens after all "onJobEnd"s + // So we should update the execution lists. + markExecutionFinished(executionId) + } else { + // There are some running jobs, onExecutionEnd happens before some "onJobEnd"s. + // Then we don't if the execution is successful, so let the last onJobEnd updates the + // execution lists. + } + } + } + + private def markExecutionFinished(executionId: Long): Unit = { + activeExecutions.remove(executionId).foreach { executionUIData => + if (executionUIData.isFailed) { + failedExecutions += executionUIData + trimExecutionsIfNecessary(failedExecutions) + } else { + completedExecutions += executionUIData + trimExecutionsIfNecessary(completedExecutions) + } + } + } + + def getRunningExecutions: Seq[SQLExecutionUIData] = synchronized { + activeExecutions.values.toSeq + } + + def getFailedExecutions: Seq[SQLExecutionUIData] = synchronized { + failedExecutions + } + + def getCompletedExecutions: Seq[SQLExecutionUIData] = synchronized { + completedExecutions + } + + def getExecution(executionId: Long): Option[SQLExecutionUIData] = synchronized { + _executionIdToData.get(executionId) + } + + /** + * Get all accumulator updates from all tasks which belong to this execution and merge them. + */ + def getExecutionMetrics(executionId: Long): Map[Long, Any] = synchronized { + _executionIdToData.get(executionId) match { + case Some(executionUIData) => + val accumulatorUpdates = { + for (stageId <- executionUIData.stages; + stageMetrics <- _stageIdToStageMetrics.get(stageId).toIterable; + taskMetrics <- stageMetrics.taskIdToMetricUpdates.values; + accumulatorUpdate <- taskMetrics.accumulatorUpdates.toSeq) yield { + accumulatorUpdate + } + }.filter { case (id, _) => executionUIData.accumulatorMetrics.contains(id) } + mergeAccumulatorUpdates(accumulatorUpdates, accumulatorId => + executionUIData.accumulatorMetrics(accumulatorId).metricParam). 
+ mapValues(_.asInstanceOf[SQLMetricValue[_]].value) + case None => + // This execution has been dropped + Map.empty + } + } + + private def mergeAccumulatorUpdates( + accumulatorUpdates: Seq[(Long, Any)], + paramFunc: Long => SQLMetricParam[SQLMetricValue[Any], Any]): Map[Long, Any] = { + accumulatorUpdates.groupBy(_._1).map { case (accumulatorId, values) => + val param = paramFunc(accumulatorId) + (accumulatorId, + values.map(_._2.asInstanceOf[SQLMetricValue[Any]]).foldLeft(param.zero)(param.addInPlace)) + } + } + +} + +/** + * Represent all necessary data for an execution that will be used in Web UI. + */ +private[ui] class SQLExecutionUIData( + val executionId: Long, + val description: String, + val details: String, + val physicalPlanDescription: String, + val physicalPlanGraph: SparkPlanGraph, + val accumulatorMetrics: Map[Long, SQLPlanMetric], + val submissionTime: Long, + var completionTime: Option[Long] = None, + val jobs: mutable.HashMap[Long, JobExecutionStatus] = mutable.HashMap.empty, + val stages: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer()) { + + /** + * Return whether there are running jobs in this execution. + */ + def hasRunningJobs: Boolean = jobs.values.exists(_ == JobExecutionStatus.RUNNING) + + /** + * Return whether there are any failed jobs in this execution. + */ + def isFailed: Boolean = jobs.values.exists(_ == JobExecutionStatus.FAILED) + + def runningJobs: Seq[Long] = + jobs.filter { case (_, status) => status == JobExecutionStatus.RUNNING }.keys.toSeq + + def succeededJobs: Seq[Long] = + jobs.filter { case (_, status) => status == JobExecutionStatus.SUCCEEDED }.keys.toSeq + + def failedJobs: Seq[Long] = + jobs.filter { case (_, status) => status == JobExecutionStatus.FAILED }.keys.toSeq +} + +/** + * Represent a metric in a SQLPlan. + * + * Because we cannot revert our changes for an "Accumulator", we need to maintain accumulator + * updates for each task. So that if a task is retried, we can simply override the old updates with + * the new updates of the new attempt task. Since we cannot add them to accumulator, we need to use + * "AccumulatorParam" to get the aggregation value. + */ +private[ui] case class SQLPlanMetric( + name: String, + accumulatorId: Long, + metricParam: SQLMetricParam[SQLMetricValue[Any], Any]) + +/** + * Store all accumulatorUpdates for all tasks in a Spark stage. + */ +private[ui] class SQLStageMetrics( + val stageAttemptId: Long, + val taskIdToMetricUpdates: mutable.HashMap[Long, SQLTaskMetrics] = mutable.HashMap.empty) + +/** + * Store all accumulatorUpdates for a Spark task. + */ +private[ui] class SQLTaskMetrics( + val attemptId: Long, // TODO not used yet + var finished: Boolean, + var accumulatorUpdates: Map[Long, Any]) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala new file mode 100644 index 000000000000..0b0867f67eb6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.Logging +import org.apache.spark.sql.SQLContext +import org.apache.spark.ui.{SparkUI, SparkUITab} + +private[sql] class SQLTab(sqlContext: SQLContext, sparkUI: SparkUI) + extends SparkUITab(sparkUI, SQLTab.nextTabName) with Logging { + + val parent = sparkUI + val listener = sqlContext.listener + + attachPage(new AllExecutionsPage(this)) + attachPage(new ExecutionPage(this)) + parent.attachTab(this) + + parent.addStaticHandler(SQLTab.STATIC_RESOURCE_DIR, "/static/sql") +} + +private[sql] object SQLTab { + + private val STATIC_RESOURCE_DIR = "org/apache/spark/sql/execution/ui/static" + + private val nextTabId = new AtomicInteger(0) + + private def nextTabName: String = { + val nextId = nextTabId.getAndIncrement() + if (nextId == 0) "SQL" else s"SQL$nextId" + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala new file mode 100644 index 000000000000..ae3d752dde34 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.mutable + +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.{SQLMetricParam, SQLMetricValue} + +/** + * A graph used for storing information of an executionPlan of DataFrame. + * + * Each graph is defined with a set of nodes and a set of edges. Each node represents a node in the + * SparkPlan tree, and each edge represents a parent-child relationship between two nodes. + */ +private[ui] case class SparkPlanGraph( + nodes: Seq[SparkPlanGraphNode], edges: Seq[SparkPlanGraphEdge]) { + + def makeDotFile(metrics: Map[Long, Any]): String = { + val dotFile = new StringBuilder + dotFile.append("digraph G {\n") + nodes.foreach(node => dotFile.append(node.makeDotNode(metrics) + "\n")) + edges.foreach(edge => dotFile.append(edge.makeDotEdge + "\n")) + dotFile.append("}") + dotFile.toString() + } +} + +private[sql] object SparkPlanGraph { + + /** + * Build a SparkPlanGraph from the root of a SparkPlan tree. 
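As a rough illustration of how this graph is meant to be consumed, the sketch below builds a SparkPlanGraph from a DataFrame's physical plan and renders it as DOT. It is illustrative only: it assumes it is compiled inside the org.apache.spark.sql.execution.ui package (these types are private[ui]), and the DataFrame is an arbitrary example.

    package org.apache.spark.sql.execution.ui

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    object PlanGraphSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("plan-graph-sketch").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)
        val df = sqlContext.range(0, 100).groupBy().count()
        // An empty metrics map simply omits the metric lines from each node label;
        // live values would come from SQLListener.getExecutionMetrics(executionId).
        val dot = SparkPlanGraph(df.queryExecution.executedPlan).makeDotFile(Map.empty[Long, Any])
        println(dot) // digraph G { 0 [label="..."]; ... 1->0; }
        sc.stop()
      }
    }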
+ */ + def apply(plan: SparkPlan): SparkPlanGraph = { + val nodeIdGenerator = new AtomicLong(0) + val nodes = mutable.ArrayBuffer[SparkPlanGraphNode]() + val edges = mutable.ArrayBuffer[SparkPlanGraphEdge]() + buildSparkPlanGraphNode(plan, nodeIdGenerator, nodes, edges) + new SparkPlanGraph(nodes, edges) + } + + private def buildSparkPlanGraphNode( + plan: SparkPlan, + nodeIdGenerator: AtomicLong, + nodes: mutable.ArrayBuffer[SparkPlanGraphNode], + edges: mutable.ArrayBuffer[SparkPlanGraphEdge]): SparkPlanGraphNode = { + val metrics = plan.metrics.toSeq.map { case (key, metric) => + SQLPlanMetric(metric.name.getOrElse(key), metric.id, + metric.param.asInstanceOf[SQLMetricParam[SQLMetricValue[Any], Any]]) + } + val node = SparkPlanGraphNode( + nodeIdGenerator.getAndIncrement(), plan.nodeName, plan.simpleString, metrics) + nodes += node + val childrenNodes = plan.children.map( + child => buildSparkPlanGraphNode(child, nodeIdGenerator, nodes, edges)) + for (child <- childrenNodes) { + edges += SparkPlanGraphEdge(child.id, node.id) + } + node + } +} + +/** + * Represent a node in the SparkPlan tree, along with its metrics. + * + * @param id generated by "SparkPlanGraph". There is no duplicate id in a graph + * @param name the name of this SparkPlan node + * @param metrics metrics that this SparkPlan node will track + */ +private[ui] case class SparkPlanGraphNode( + id: Long, name: String, desc: String, metrics: Seq[SQLPlanMetric]) { + + def makeDotNode(metricsValue: Map[Long, Any]): String = { + val values = { + for (metric <- metrics; + value <- metricsValue.get(metric.accumulatorId)) yield { + metric.name + ": " + value + } + } + val label = if (values.isEmpty) { + name + } else { + // If there are metrics, display all metrics in a separate line. We should use an escaped + // "\n" here to follow the dot syntax. + // + // Note: whitespace between two "\n"s is to create an empty line between the name of + // SparkPlan and metrics. If removing it, it won't display the empty line in UI. + name + "\\n \\n" + values.mkString("\\n") + } + s""" $id [label="$label"];""" + } +} + +/** + * Represent an edge in the SparkPlan tree. `fromId` is the parent node id, and `toId` is the child + * node id. + */ +private[ui] case class SparkPlanGraphEdge(fromId: Long, toId: Long) { + + def makeDotEdge: String = s""" $fromId->$toId;\n""" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index 278dd438fab4..258afadc7695 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -17,13 +17,15 @@ package org.apache.spark.sql.expressions -import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.types._ import org.apache.spark.annotation.Experimental /** * :: Experimental :: - * The abstract class for implementing user-defined aggregate functions. + * The base class for implementing user-defined aggregate functions (UDAF). */ @Experimental abstract class UserDefinedAggregateFunction extends Serializable { @@ -64,22 +66,35 @@ abstract class UserDefinedAggregateFunction extends Serializable { /** * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]]. 
*/ - def returnDataType: DataType + def dataType: DataType - /** Indicates if this function is deterministic. */ + /** + * Returns true iff this function is deterministic, i.e. given the same input, + * always return the same output. + */ def deterministic: Boolean /** - * Initializes the given aggregation buffer. Initial values set by this method should satisfy - * the condition that when merging two buffers with initial values, the new buffer - * still store initial values. + * Initializes the given aggregation buffer, i.e. the zero value of the aggregation buffer. + * + * The contract should be that applying the merge function on two initial buffers should just + * return the initial buffer itself, i.e. + * `merge(initialBuffer, initialBuffer)` should equal `initialBuffer`. */ def initialize(buffer: MutableAggregationBuffer): Unit - /** Updates the given aggregation buffer `buffer` with new input data from `input`. */ + /** + * Updates the given aggregation buffer `buffer` with new input data from `input`. + * + * This is called once per input row. + */ def update(buffer: MutableAggregationBuffer, input: Row): Unit - /** Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. */ + /** + * Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. + * + * This is called when we merge two partially aggregated data together. + */ def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit /** @@ -87,14 +102,43 @@ abstract class UserDefinedAggregateFunction extends Serializable { * aggregation buffer. */ def evaluate(buffer: Row): Any + + /** + * Creates a [[Column]] for this UDAF using given [[Column]]s as input arguments. + */ + @scala.annotation.varargs + def apply(exprs: Column*): Column = { + val aggregateExpression = + AggregateExpression2( + ScalaUDAF(exprs.map(_.expr), this), + Complete, + isDistinct = false) + Column(aggregateExpression) + } + + /** + * Creates a [[Column]] for this UDAF using the distinct values of the given + * [[Column]]s as input arguments. + */ + @scala.annotation.varargs + def distinct(exprs: Column*): Column = { + val aggregateExpression = + AggregateExpression2( + ScalaUDAF(exprs.map(_.expr), this), + Complete, + isDistinct = true) + Column(aggregateExpression) + } } /** * :: Experimental :: * A [[Row]] representing an mutable aggregation buffer. + * + * This is not meant to be extended outside of Spark. */ @Experimental -trait MutableAggregationBuffer extends Row { +abstract class MutableAggregationBuffer extends Row { /** Update the ith value of this buffer. */ def update(i: Int, value: Any): Unit diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 35958299076c..435e6319a64c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1058,14 +1058,6 @@ object functions { */ def factorial(e: Column): Column = Factorial(e.expr) - /** - * Computes the factorial of the given column. - * - * @group math_funcs - * @since 1.5.0 - */ - def factorial(columnName: String): Column = factorial(Column(columnName)) - /** * Computes the floor of the given value. * @@ -1108,11 +1100,11 @@ object functions { } /** - * Computes hex value of the given column. - * - * @group math_funcs - * @since 1.5.0 - */ + * Computes hex value of the given column. 
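To make the revised contract concrete, here is a minimal UDAF sketch written against the interface above. The class and column names are invented for illustration; note the renamed dataType and the Column-returning apply.

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
    import org.apache.spark.sql.types._

    class LongSum extends UserDefinedAggregateFunction {
      def inputSchema: StructType = StructType(StructField("value", LongType) :: Nil)
      def bufferSchema: StructType = StructType(StructField("sum", LongType) :: Nil)
      def dataType: DataType = LongType
      def deterministic: Boolean = true

      // Zero value: merge(initialBuffer, initialBuffer) == initialBuffer, since 0 + 0 == 0.
      def initialize(buffer: MutableAggregationBuffer): Unit = {
        buffer(0) = 0L
      }

      // Called once per input row.
      def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
        if (!input.isNullAt(0)) buffer(0) = buffer.getLong(0) + input.getLong(0)
      }

      // Called when two partial aggregates are combined.
      def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
        buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
      }

      def evaluate(buffer: Row): Any = buffer.getLong(0)
    }

    // Usage via the new apply, assuming a DataFrame `df` with columns "key" and "value":
    //   val longSum = new LongSum
    //   df.groupBy("key").agg(longSum(df("value")))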
+ * + * @group math_funcs + * @since 1.5.0 + */ def hex(column: Column): Column = Hex(column.expr) /** @@ -1366,15 +1358,6 @@ object functions { */ def pmod(dividend: Column, divisor: Column): Column = Pmod(dividend.expr, divisor.expr) - /** - * Returns the positive value of dividend mod divisor. - * - * @group math_funcs - * @since 1.5.0 - */ - def pmod(dividendColName: String, divisorColName: String): Column = - pmod(Column(dividendColName), Column(divisorColName)) - /** * Returns the double value that is closest in value to the argument and * is equal to a mathematical integer. @@ -1401,14 +1384,6 @@ object functions { */ def round(e: Column): Column = round(e.expr, 0) - /** - * Returns the value of the given column rounded to 0 decimal places. - * - * @group math_funcs - * @since 1.5.0 - */ - def round(columnName: String): Column = round(Column(columnName), 0) - /** * Round the value of `e` to `scale` decimal places if `scale` >= 0 * or at integral part when `scale` < 0. @@ -1418,15 +1393,6 @@ object functions { */ def round(e: Column, scale: Int): Column = Round(e.expr, Literal(scale)) - /** - * Round the value of the given column to `scale` decimal places if `scale` >= 0 - * or at integral part when `scale` < 0. - * - * @group math_funcs - * @since 1.5.0 - */ - def round(columnName: String, scale: Int): Column = round(Column(columnName), scale) - /** * Shift the the given value numBits left. If the given value is a long value, this function * will return a long value else it will return an integer value. @@ -1436,16 +1402,6 @@ object functions { */ def shiftLeft(e: Column, numBits: Int): Column = ShiftLeft(e.expr, lit(numBits).expr) - /** - * Shift the the given value numBits left. If the given value is a long value, this function - * will return a long value else it will return an integer value. - * - * @group math_funcs - * @since 1.5.0 - */ - def shiftLeft(columnName: String, numBits: Int): Column = - shiftLeft(Column(columnName), numBits) - /** * Shift the the given value numBits right. If the given value is a long value, it will return * a long value else it will return an integer value. @@ -1455,16 +1411,6 @@ object functions { */ def shiftRight(e: Column, numBits: Int): Column = ShiftRight(e.expr, lit(numBits).expr) - /** - * Unsigned shift the the given value numBits right. If the given value is a long value, - * it will return a long value else it will return an integer value. - * - * @group math_funcs - * @since 1.5.0 - */ - def shiftRightUnsigned(columnName: String, numBits: Int): Column = - shiftRightUnsigned(Column(columnName), numBits) - /** * Unsigned shift the the given value numBits right. If the given value is a long value, * it will return a long value else it will return an integer value. @@ -1475,16 +1421,6 @@ object functions { def shiftRightUnsigned(e: Column, numBits: Int): Column = ShiftRightUnsigned(e.expr, lit(numBits).expr) - /** - * Shift the the given value numBits right. If the given value is a long value, it will return - * a long value else it will return an integer value. - * - * @group math_funcs - * @since 1.5.0 - */ - def shiftRight(columnName: String, numBits: Int): Column = - shiftRight(Column(columnName), numBits) - /** * Computes the signum of the given value. * @@ -1788,14 +1724,6 @@ object functions { new StringLocate(lit(substr).expr, str.expr) } - /** - * Trim the spaces from left end for the specified string value. 
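Since these String-named overloads are going away, a brief migration sketch may help; it assumes a DataFrame `df` whose column names are invented for the example, with `col` imported from org.apache.spark.sql.functions.

    import org.apache.spark.sql.functions._

    // Pass a Column explicitly wherever a column-name String used to be accepted.
    df.select(
      round(col("price"), 2),        // was round("price", 2)
      pmod(col("a"), col("b")),      // was pmod("a", "b")
      shiftLeft(col("bits"), 2),     // was shiftLeft("bits", 2)
      factorial(col("n")))           // was factorial("n")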
- * - * @group string_funcs - * @since 1.5.0 - */ - def ltrim(e: Column): Column = StringTrimLeft(e.expr) - /** * Locate the position of the first occurrence of substr in a string column, after position pos. * @@ -1819,6 +1747,14 @@ object functions { StringLPad(str.expr, lit(len).expr, lit(pad).expr) } + /** + * Trim the spaces from left end for the specified string value. + * + * @group string_funcs + * @since 1.5.0 + */ + def ltrim(e: Column): Column = StringTrimLeft(e.expr) + /** * Extract a specific(idx) group identified by a java regex, from the specified string column. * @@ -1927,6 +1863,18 @@ object functions { def substring_index(str: Column, delim: String, count: Int): Column = SubstringIndex(str.expr, lit(delim).expr, lit(count).expr) + /** + * Translate any character in the src by a character in replaceString. + * The characters in replaceString is corresponding to the characters in matchingString. + * The translate will happen when any character in the string matching with the character + * in the matchingString. + * + * @group string_funcs + * @since 1.5.0 + */ + def translate(src: Column, matchingString: String, replaceString: String): Column = + StringTranslate(src.expr, lit(matchingString).expr, lit(replaceString).expr) + /** * Trim the spaces from both ends for the specified string column. * @@ -2184,6 +2132,14 @@ object functions { // Collection functions ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Returns true if the array contain the value + * @group collection_funcs + * @since 1.5.0 + */ + def array_contains(column: Column, value: Any): Column = + ArrayContains(column.expr, Literal(value)) + /** * Creates a new row for each element in the given array or map column. * @@ -2556,6 +2512,7 @@ object functions { * @group udf_funcs * @since 1.5.0 */ + @scala.annotation.varargs def callUDF(udfName: String, cols: Column*): Column = { UnresolvedFunction(udfName, cols.map(_.expr), isDistinct = false) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala deleted file mode 100644 index cc918c237192..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcUtils.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.jdbc - -import java.sql.{Connection, DriverManager} -import java.util.Properties - -import scala.util.Try - -/** - * Util functions for JDBC tables. - */ -private[sql] object JdbcUtils { - - /** - * Establishes a JDBC connection. 
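The helpers touched in this hunk (array_contains, translate, the varargs callUDF, and the relocated ltrim) can be exercised as below; `df` and its columns are assumed for illustration.

    import org.apache.spark.sql.functions._

    df.select(
      array_contains(col("tags"), "spark"),  // true where the array column "tags" holds "spark"
      translate(col("path"), "\\", "/"),     // replace every '\' with '/'
      ltrim(col("name")))                    // strip leading spaces

    // callUDF is now @varargs, so Java callers can also pass columns directly:
    //   callUDF("myUdf", col("a"), col("b"))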
- */ - def createConnection(url: String, connectionProperties: Properties): Connection = { - DriverManager.getConnection(url, connectionProperties) - } - - /** - * Returns true if the table already exists in the JDBC database. - */ - def tableExists(conn: Connection, table: String): Boolean = { - // Somewhat hacky, but there isn't a good way to identify whether a table exists for all - // SQL database systems, considering "table" could also include the database name. - Try(conn.prepareStatement(s"SELECT 1 FROM $table LIMIT 1").executeQuery().next()).isSuccess - } - - /** - * Drops a table from the JDBC database. - */ - def dropTable(conn: Connection, table: String): Unit = { - conn.prepareStatement(s"DROP TABLE $table").executeUpdate() - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala deleted file mode 100644 index 035e0510080f..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import java.sql.{Connection, Driver, DriverManager, DriverPropertyInfo, PreparedStatement, SQLFeatureNotSupportedException} -import java.util.Properties - -import scala.collection.mutable - -import org.apache.spark.Logging -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils - -package object jdbc { - private[sql] object JDBCWriteDetails extends Logging { - /** - * Returns a PreparedStatement that inserts a row into table via conn. - */ - def insertStatement(conn: Connection, table: String, rddSchema: StructType): - PreparedStatement = { - val sql = new StringBuilder(s"INSERT INTO $table VALUES (") - var fieldsLeft = rddSchema.fields.length - while (fieldsLeft > 0) { - sql.append("?") - if (fieldsLeft > 1) sql.append(", ") else sql.append(")") - fieldsLeft = fieldsLeft - 1 - } - conn.prepareStatement(sql.toString) - } - - /** - * Saves a partition of a DataFrame to the JDBC database. This is done in - * a single database transaction in order to avoid repeatedly inserting - * data as much as possible. - * - * It is still theoretically possible for rows in a DataFrame to be - * inserted into the database more than once if a stage somehow fails after - * the commit occurs but before the stage can return successfully. - * - * This is not a closure inside saveTable() because apparently cosmetic - * implementation changes elsewhere might easily render such a closure - * non-Serializable. Instead, we explicitly close over all variables that - * are used. 
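This write path is being removed from sql.jdbc here, presumably relocated rather than dropped, since the public DataFrameWriter entry point keeps working. For orientation, the user-facing call that ultimately drives the partition-level writer below looks like this (URL, table, and credentials are placeholders):

    import java.util.Properties

    val props = new Properties()
    props.setProperty("user", "username")
    props.setProperty("password", "secret")
    // Each partition of `df` is written in its own transaction by savePartition.
    df.write.jdbc("jdbc:postgresql://dbhost:5432/mydb", "people", props)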
- */ - def savePartition( - getConnection: () => Connection, - table: String, - iterator: Iterator[Row], - rddSchema: StructType, - nullTypes: Array[Int]): Iterator[Byte] = { - val conn = getConnection() - var committed = false - try { - conn.setAutoCommit(false) // Everything in the same db transaction. - val stmt = insertStatement(conn, table, rddSchema) - try { - while (iterator.hasNext) { - val row = iterator.next() - val numFields = rddSchema.fields.length - var i = 0 - while (i < numFields) { - if (row.isNullAt(i)) { - stmt.setNull(i + 1, nullTypes(i)) - } else { - rddSchema.fields(i).dataType match { - case IntegerType => stmt.setInt(i + 1, row.getInt(i)) - case LongType => stmt.setLong(i + 1, row.getLong(i)) - case DoubleType => stmt.setDouble(i + 1, row.getDouble(i)) - case FloatType => stmt.setFloat(i + 1, row.getFloat(i)) - case ShortType => stmt.setInt(i + 1, row.getShort(i)) - case ByteType => stmt.setInt(i + 1, row.getByte(i)) - case BooleanType => stmt.setBoolean(i + 1, row.getBoolean(i)) - case StringType => stmt.setString(i + 1, row.getString(i)) - case BinaryType => stmt.setBytes(i + 1, row.getAs[Array[Byte]](i)) - case TimestampType => stmt.setTimestamp(i + 1, row.getAs[java.sql.Timestamp](i)) - case DateType => stmt.setDate(i + 1, row.getAs[java.sql.Date](i)) - case t: DecimalType => stmt.setBigDecimal(i + 1, row.getDecimal(i)) - case _ => throw new IllegalArgumentException( - s"Can't translate non-null value for field $i") - } - } - i = i + 1 - } - stmt.executeUpdate() - } - } finally { - stmt.close() - } - conn.commit() - committed = true - } finally { - if (!committed) { - // The stage must fail. We got here through an exception path, so - // let the exception through unless rollback() or close() want to - // tell the user about another problem. - conn.rollback() - conn.close() - } else { - // The stage must succeed. We cannot propagate any exception close() might throw. - try { - conn.close() - } catch { - case e: Exception => logWarning("Transaction succeeded, but closing failed", e) - } - } - } - Array[Byte]().iterator - } - - /** - * Compute the schema string for this RDD. - */ - def schemaString(df: DataFrame, url: String): String = { - val sb = new StringBuilder() - val dialect = JdbcDialects.get(url) - df.schema.fields foreach { field => { - val name = field.name - val typ: String = - dialect.getJDBCType(field.dataType).map(_.databaseTypeDefinition).getOrElse( - field.dataType match { - case IntegerType => "INTEGER" - case LongType => "BIGINT" - case DoubleType => "DOUBLE PRECISION" - case FloatType => "REAL" - case ShortType => "INTEGER" - case ByteType => "BYTE" - case BooleanType => "BIT(1)" - case StringType => "TEXT" - case BinaryType => "BLOB" - case TimestampType => "TIMESTAMP" - case DateType => "DATE" - case t: DecimalType => s"DECIMAL(${t.precision}},${t.scale}})" - case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC") - }) - val nullable = if (field.nullable) "" else "NOT NULL" - sb.append(s", $name $typ $nullable") - }} - if (sb.length < 2) "" else sb.substring(2) - } - - /** - * Saves the RDD to the database in a single transaction. 
- */ - def saveTable( - df: DataFrame, - url: String, - table: String, - properties: Properties = new Properties()) { - val dialect = JdbcDialects.get(url) - val nullTypes: Array[Int] = df.schema.fields.map { field => - dialect.getJDBCType(field.dataType).map(_.jdbcNullType).getOrElse( - field.dataType match { - case IntegerType => java.sql.Types.INTEGER - case LongType => java.sql.Types.BIGINT - case DoubleType => java.sql.Types.DOUBLE - case FloatType => java.sql.Types.REAL - case ShortType => java.sql.Types.INTEGER - case ByteType => java.sql.Types.INTEGER - case BooleanType => java.sql.Types.BIT - case StringType => java.sql.Types.CLOB - case BinaryType => java.sql.Types.BLOB - case TimestampType => java.sql.Types.TIMESTAMP - case DateType => java.sql.Types.DATE - case t: DecimalType => java.sql.Types.DECIMAL - case _ => throw new IllegalArgumentException( - s"Can't translate null value for field $field") - }) - } - - val rddSchema = df.schema - val driver: String = DriverRegistry.getDriverClassName(url) - val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties) - df.foreachPartition { iterator => - JDBCWriteDetails.savePartition(getConnection, table, iterator, rddSchema, nullTypes) - } - } - - } - - private [sql] class DriverWrapper(val wrapped: Driver) extends Driver { - override def acceptsURL(url: String): Boolean = wrapped.acceptsURL(url) - - override def jdbcCompliant(): Boolean = wrapped.jdbcCompliant() - - override def getPropertyInfo(url: String, info: Properties): Array[DriverPropertyInfo] = { - wrapped.getPropertyInfo(url, info) - } - - override def getMinorVersion: Int = wrapped.getMinorVersion - - def getParentLogger: java.util.logging.Logger = - throw new SQLFeatureNotSupportedException( - s"${this.getClass().getName}.getParentLogger is not yet implemented.") - - override def connect(url: String, info: Properties): Connection = wrapped.connect(url, info) - - override def getMajorVersion: Int = wrapped.getMajorVersion - } - - /** - * java.sql.DriverManager is always loaded by bootstrap classloader, - * so it can't load JDBC drivers accessible by Spark ClassLoader. - * - * To solve the problem, drivers from user-supplied jars are wrapped - * into thin wrapper. 
- */ - private [sql] object DriverRegistry extends Logging { - - private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty - - def register(className: String): Unit = { - val cls = Utils.getContextOrSparkClassLoader.loadClass(className) - if (cls.getClassLoader == null) { - logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") - } else if (wrapperMap.get(className).isDefined) { - logTrace(s"Wrapper for $className already exists") - } else { - synchronized { - if (wrapperMap.get(className).isEmpty) { - val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) - DriverManager.registerDriver(wrapper) - wrapperMap(className) = wrapper - logTrace(s"Wrapper for $className registered") - } - } - } - } - - def getDriverClassName(url: String): String = DriverManager.getDriver(url) match { - case wrapper: DriverWrapper => wrapper.wrapped.getClass.getCanonicalName - case driver => driver.getClass.getCanonicalName - } - } - -} // package object jdbc diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala deleted file mode 100644 index 562b058414d0..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.json - -import java.io.IOException - -import org.apache.hadoop.fs.{FileSystem, Path} - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} - - -private[sql] class DefaultSource - extends RelationProvider - with SchemaRelationProvider - with CreatableRelationProvider { - - private def checkPath(parameters: Map[String, String]): String = { - parameters.getOrElse("path", sys.error("'path' must be specified for json data.")) - } - - /** Constraints to be imposed on dataframe to be stored. */ - private def checkConstraints(data: DataFrame): Unit = { - if (data.schema.fieldNames.length != data.schema.fieldNames.distinct.length) { - val duplicateColumns = data.schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + - s"cannot save to JSON format") - } - } - - /** Returns a new base relation with the parameters. 
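The DefaultSource deleted here is what format("json") used to resolve to; the public read/write API is unchanged by the move. A usage sketch of the two relation-creation paths above, with placeholder paths:

    // RelationProvider path: infer the schema by sampling the input.
    val people = sqlContext.read.format("json").load("/tmp/people.json")

    // CreatableRelationProvider path: write out a DataFrame, honoring SaveMode.
    people.write.mode("overwrite").format("json").save("/tmp/people_out")

    // Per checkConstraints above, a DataFrame with duplicate column names is
    // rejected with an AnalysisException before anything is written.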
*/ - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - val path = checkPath(parameters) - val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) - - new JSONRelation(path, samplingRatio, None, sqlContext) - } - - /** Returns a new base relation with the given schema and parameters. */ - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String], - schema: StructType): BaseRelation = { - val path = checkPath(parameters) - val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) - - new JSONRelation(path, samplingRatio, Some(schema), sqlContext) - } - - override def createRelation( - sqlContext: SQLContext, - mode: SaveMode, - parameters: Map[String, String], - data: DataFrame): BaseRelation = { - // check if dataframe satisfies the constraints - // before moving forward - checkConstraints(data) - - val path = checkPath(parameters) - val filesystemPath = new Path(path) - val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val doSave = if (fs.exists(filesystemPath)) { - mode match { - case SaveMode.Append => - sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") - case SaveMode.Overwrite => { - JSONRelation.delete(filesystemPath, fs) - true - } - case SaveMode.ErrorIfExists => - sys.error(s"path $path already exists.") - case SaveMode.Ignore => false - } - } else { - true - } - if (doSave) { - // Only save data when the save mode is not ignore. - data.toJSON.saveAsTextFile(path) - } - - createRelation(sqlContext, parameters, data.schema) - } -} - -private[sql] class JSONRelation( - // baseRDD is not immutable with respect to INSERT OVERWRITE - // and so it must be recreated at least as often as the - // underlying inputs are modified. To be safe, a function is - // used instead of a regular RDD value to ensure a fresh RDD is - // recreated for each and every operation. - baseRDD: () => RDD[String], - val path: Option[String], - val samplingRatio: Double, - userSpecifiedSchema: Option[StructType])( - @transient val sqlContext: SQLContext) - extends BaseRelation - with TableScan - with InsertableRelation - with CatalystScan { - - def this( - path: String, - samplingRatio: Double, - userSpecifiedSchema: Option[StructType], - sqlContext: SQLContext) = - this( - () => sqlContext.sparkContext.textFile(path), - Some(path), - samplingRatio, - userSpecifiedSchema)(sqlContext) - - /** Constraints to be imposed on dataframe to be stored. 
*/ - private def checkConstraints(data: DataFrame): Unit = { - if (data.schema.fieldNames.length != data.schema.fieldNames.distinct.length) { - val duplicateColumns = data.schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + - s"cannot save to JSON format") - } - } - - override val needConversion: Boolean = false - - override lazy val schema = userSpecifiedSchema.getOrElse { - InferSchema( - baseRDD(), - samplingRatio, - sqlContext.conf.columnNameOfCorruptRecord) - } - - override def buildScan(): RDD[Row] = { - // Rely on type erasure hack to pass RDD[InternalRow] back as RDD[Row] - JacksonParser( - baseRDD(), - schema, - sqlContext.conf.columnNameOfCorruptRecord).asInstanceOf[RDD[Row]] - } - - override def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] = { - // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] - JacksonParser( - baseRDD(), - StructType.fromAttributes(requiredColumns), - sqlContext.conf.columnNameOfCorruptRecord).asInstanceOf[RDD[Row]] - } - - override def insert(data: DataFrame, overwrite: Boolean): Unit = { - // check if dataframe satisfies constraints - // before moving forward - checkConstraints(data) - - val filesystemPath = path match { - case Some(p) => new Path(p) - case None => - throw new IOException(s"Cannot INSERT into table with no path defined") - } - - val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - - if (overwrite) { - if (fs.exists(filesystemPath)) { - JSONRelation.delete(filesystemPath, fs) - } - // Write the data. - data.toJSON.saveAsTextFile(filesystemPath.toString) - // Right now, we assume that the schema is not changed. We will not update the schema. - // schema = data.schema - } else { - // TODO: Support INSERT INTO - sys.error("JSON table only support INSERT OVERWRITE for now.") - } - } - - override def hashCode(): Int = 41 * (41 + path.hashCode) + schema.hashCode() - - override def equals(other: Any): Boolean = other match { - case that: JSONRelation => - (this.path == that.path) && this.schema.sameType(that.schema) - case _ => false - } -} - -private object JSONRelation { - - /** Delete the specified directory to overwrite it with new JSON data. */ - def delete(dir: Path, fs: FileSystem): Unit = { - var success: Boolean = false - val failMessage = s"Unable to clear output directory $dir prior to writing to JSON table" - try { - success = fs.delete(dir, true /* recursive */) - } catch { - case e: IOException => - throw new IOException(s"$failMessage\n${e.toString}") - } - if (!success) { - throw new IOException(failMessage) - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/optimizer/extendedOperatorOptimizations.scala b/sql/core/src/main/scala/org/apache/spark/sql/optimizer/extendedOperatorOptimizations.scala deleted file mode 100644 index 5a4dde575696..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/optimizer/extendedOperatorOptimizations.scala +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.optimizer - -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, RightOuter, LeftSemi} -import org.apache.spark.sql.catalyst.plans.logical.{Project, Filter, Join, LogicalPlan} -import org.apache.spark.sql.catalyst.rules.Rule - -/** - * An optimization rule used to insert Filters to filter out rows whose equal join keys - * have at least one null values. For this kind of rows, they will not contribute to - * the join results of equal joins because a null does not equal another null. We can - * filter them out before shuffling join input rows. For example, we have two tables - * - * table1(key String, value Int) - * "str1"|1 - * null |2 - * - * table2(key String, value Int) - * "str1"|3 - * null |4 - * - * For a inner equal join, the result will be - * "str1"|1|"str1"|3 - * - * those two rows having null as the value of key will not contribute to the result. - * So, we can filter them out early. - * - * This optimization rule can be disabled by setting spark.sql.advancedOptimization to false. - * - */ -case class FilterNullsInJoinKey( - sqlContext: SQLContext) - extends Rule[LogicalPlan] { - - /** - * Checks if we need to add a Filter operator. We will add a Filter when - * there is any attribute in `keys` whose corresponding attribute of `keys` - * in `plan.output` is still nullable (`nullable` field is `true`). - */ - private def needsFilter(keys: Seq[Expression], plan: LogicalPlan): Boolean = { - val keyAttributeSet = AttributeSet(keys.filter(_.isInstanceOf[Attribute])) - plan.output.filter(keyAttributeSet.contains).exists(_.nullable) - } - - /** - * Adds a Filter operator to make sure that every attribute in `keys` is non-nullable. - */ - private def addFilterIfNecessary( - keys: Seq[Expression], - child: LogicalPlan): LogicalPlan = { - // We get all attributes from keys. - val attributes = keys.filter(_.isInstanceOf[Attribute]) - - // Then, we create a Filter to make sure these attributes are non-nullable. - val filter = - if (attributes.nonEmpty) { - Filter(Not(AtLeastNNulls(1, attributes)), child) - } else { - child - } - - filter - } - - /** - * We reconstruct the join condition. - */ - private def reconstructJoinCondition( - leftKeys: Seq[Expression], - rightKeys: Seq[Expression], - otherPredicate: Option[Expression]): Expression = { - // First, we rewrite the equal condition part. When we extract those keys, - // we use splitConjunctivePredicates. So, it is safe to use .reduce(And). - val rewrittenEqualJoinCondition = leftKeys.zip(rightKeys).map { - case (l, r) => EqualTo(l, r) - }.reduce(And) - - // Then, we add otherPredicate. When we extract those equal condition part, - // we use splitConjunctivePredicates. So, it is safe to use - // And(rewrittenEqualJoinCondition, c). 
- val rewrittenJoinCondition = otherPredicate - .map(c => And(rewrittenEqualJoinCondition, c)) - .getOrElse(rewrittenEqualJoinCondition) - - rewrittenJoinCondition - } - - def apply(plan: LogicalPlan): LogicalPlan = { - if (!sqlContext.conf.advancedSqlOptimizations) { - plan - } else { - plan transform { - case join: Join => join match { - // For a inner join having equal join condition part, we can add filters - // to both sides of the join operator. - case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) - if needsFilter(leftKeys, left) || needsFilter(rightKeys, right) => - val withLeftFilter = addFilterIfNecessary(leftKeys, left) - val withRightFilter = addFilterIfNecessary(rightKeys, right) - val rewrittenJoinCondition = - reconstructJoinCondition(leftKeys, rightKeys, condition) - - Join(withLeftFilter, withRightFilter, Inner, Some(rewrittenJoinCondition)) - - // For a left outer join having equal join condition part, we can add a filter - // to the right side of the join operator. - case ExtractEquiJoinKeys(LeftOuter, leftKeys, rightKeys, condition, left, right) - if needsFilter(rightKeys, right) => - val withRightFilter = addFilterIfNecessary(rightKeys, right) - val rewrittenJoinCondition = - reconstructJoinCondition(leftKeys, rightKeys, condition) - - Join(left, withRightFilter, LeftOuter, Some(rewrittenJoinCondition)) - - // For a right outer join having equal join condition part, we can add a filter - // to the left side of the join operator. - case ExtractEquiJoinKeys(RightOuter, leftKeys, rightKeys, condition, left, right) - if needsFilter(leftKeys, left) => - val withLeftFilter = addFilterIfNecessary(leftKeys, left) - val rewrittenJoinCondition = - reconstructJoinCondition(leftKeys, rightKeys, condition) - - Join(withLeftFilter, right, RightOuter, Some(rewrittenJoinCondition)) - - // For a left semi join having equal join condition part, we can add filters - // to both sides of the join operator. - case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right) - if needsFilter(leftKeys, left) || needsFilter(rightKeys, right) => - val withLeftFilter = addFilterIfNecessary(leftKeys, left) - val withRightFilter = addFilterIfNecessary(rightKeys, right) - val rewrittenJoinCondition = - reconstructJoinCondition(leftKeys, rightKeys, condition) - - Join(withLeftFilter, withRightFilter, LeftSemi, Some(rewrittenJoinCondition)) - - case other => other - } - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala deleted file mode 100644 index 975fec101d9c..000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
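The rewrite performed by the rule deleted above can be mimicked by hand at the DataFrame level, which may make the transform cases easier to follow; `df1`, `df2`, and the "key" column are assumed for illustration.

    import org.apache.spark.sql.functions.col

    // Inner equi-join: a NULL join key can never match, so pre-filtering both sides is safe.
    val left  = df1.filter(col("key").isNotNull)
    val right = df2.filter(col("key").isNotNull)
    left.join(right, left("key") === right("key"))

    // For LEFT OUTER only the right side may be pre-filtered, and for RIGHT OUTER
    // only the left side, mirroring the cases handled in apply() above.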
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.util.{Map => JMap} - -import scala.collection.JavaConversions.{iterableAsScalaIterable, mapAsJavaMap, mapAsScalaMap} - -import org.apache.hadoop.conf.Configuration -import org.apache.parquet.hadoop.api.ReadSupport.ReadContext -import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} -import org.apache.parquet.io.api.RecordMaterializer -import org.apache.parquet.schema.MessageType - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.StructType - -private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { - override def prepareForRead( - conf: Configuration, - keyValueMetaData: JMap[String, String], - fileSchema: MessageType, - readContext: ReadContext): RecordMaterializer[InternalRow] = { - log.debug(s"Preparing for read Parquet file with message type: $fileSchema") - - val toCatalyst = new CatalystSchemaConverter(conf) - val parquetRequestedSchema = readContext.getRequestedSchema - - val catalystRequestedSchema = - Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata => - metadata - // First tries to read requested schema, which may result from projections - .get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) - // If not available, tries to read Catalyst schema from file metadata. It's only - // available if the target file is written by Spark SQL. - .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY)) - }.map(StructType.fromString).getOrElse { - logDebug("Catalyst schema not available, falling back to Parquet schema") - toCatalyst.convert(parquetRequestedSchema) - } - - logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema") - new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) - } - - override def init(context: InitContext): ReadContext = { - val conf = context.getConfiguration - - // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst - // schema of this file from its the metadata. - val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA)) - - // Optional schema of requested columns, in the form of a string serialized from a Catalyst - // `StructType` containing all requested columns. - val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) - - // Below we construct a Parquet schema containing all requested columns. This schema tells - // Parquet which columns to read. - // - // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema. Otherwise, - // we have to fallback to the full file schema which contains all columns in the file. - // Obviously this may waste IO bandwidth since it may read more columns than requested. - // - // Two things to note: - // - // 1. It's possible that some requested columns don't exist in the target Parquet file. For - // example, in the case of schema merging, the globally merged schema may contain extra - // columns gathered from other Parquet files. These columns will be simply filled with nulls - // when actually reading the target Parquet file. - // - // 2. 
When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to - // Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to - // non-standard behaviors of some Parquet libraries/tools. For example, a Parquet file - // containing a single integer array field `f1` may have the following legacy 2-level - // structure: - // - // message root { - // optional group f1 (LIST) { - // required INT32 element; - // } - // } - // - // while `CatalystSchemaConverter` may generate a standard 3-level structure: - // - // message root { - // optional group f1 (LIST) { - // repeated group list { - // required INT32 element; - // } - // } - // } - // - // Apparently, we can't use the 2nd schema to read the target Parquet file as they have - // different physical structures. - val parquetRequestedSchema = - maybeRequestedSchema.fold(context.getFileSchema) { schemaString => - val toParquet = new CatalystSchemaConverter(conf) - val fileSchema = context.getFileSchema.asGroupType() - val fileFieldNames = fileSchema.getFields.map(_.getName).toSet - - StructType - // Deserializes the Catalyst schema of requested columns - .fromString(schemaString) - .map { field => - if (fileFieldNames.contains(field.name)) { - // If the field exists in the target Parquet file, extracts the field type from the - // full file schema and makes a single-field Parquet schema - new MessageType("root", fileSchema.getType(field.name)) - } else { - // Otherwise, just resorts to `CatalystSchemaConverter` - toParquet.convert(StructType(Array(field))) - } - } - // Merges all single-field Parquet schemas to form a complete schema for all requested - // columns. Note that it's possible that no columns are requested at all (e.g., count - // some partition column of a partitioned Parquet table). That's why `fold` is used here - // and always fallback to an empty Parquet schema. - .fold(new MessageType("root")) { - _ union _ - } - } - - val metadata = - Map.empty[String, String] ++ - maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ - maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) - - logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema") - new ReadContext(parquetRequestedSchema, metadata) - } -} - -private[parquet] object CatalystReadSupport { - val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema" - - val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata" -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4d942e4f9287..3780cbbcc963 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -36,6 +36,15 @@ abstract class Filter */ case class EqualTo(attribute: String, value: Any) extends Filter +/** + * Performs equality comparison, similar to [[EqualTo]]. However, this differs from [[EqualTo]] + * in that it returns `true` (rather than NULL) if both inputs are NULL, and `false` + * (rather than NULL) if one of the input is NULL and the other is not NULL. + * + * @since 1.5.0 + */ +case class EqualNullSafe(attribute: String, value: Any) extends Filter + /** * A filter that evaluates to `true` iff the attribute evaluates to a value * greater than `value`. 
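For contrast with EqualTo, null-safe equality is reachable from the Column API through the <=> operator; when the planner can translate the predicate, it can be pushed to data sources as the EqualNullSafe filter added above. A small sketch, with `df` and its columns assumed:

    import org.apache.spark.sql.functions.{col, lit}

    df.filter(col("a") <=> lit(null))  // keeps exactly the rows where a IS NULL
    df.filter(col("a") <=> col("b"))   // true when both sides are NULL, false when only one is

    // EqualTo(a, b)       evaluates to NULL if either side is NULL.
    // EqualNullSafe(a, b) evaluates to true if both are NULL, false if exactly one is.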
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 7126145ddc01..7b030b7d73bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -31,12 +31,38 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection -import org.apache.spark.sql.execution.RDDConversions +import org.apache.spark.sql.execution.{FileRelation, RDDConversions} import org.apache.spark.sql.execution.datasources.{PartitioningUtils, PartitionSpec, Partition} import org.apache.spark.sql.types.StructType import org.apache.spark.sql._ import org.apache.spark.util.SerializableConfiguration +/** + * ::DeveloperApi:: + * Data sources should implement this trait so that they can register an alias to their data source. + * This allows users to give the data source alias as the format type over the fully qualified + * class name. + * + * A new instance of this class with be instantiated each time a DDL call is made. + * + * @since 1.5.0 + */ +@DeveloperApi +trait DataSourceRegister { + + /** + * The string that represents the format that this data source provider uses. This is + * overridden by children to provide a nice alias for the data source. For example: + * + * {{{ + * override def format(): String = "parquet" + * }}} + * + * @since 1.5.0 + */ + def shortName(): String +} + /** * ::DeveloperApi:: * Implemented by objects that produce relations for a specific kind of data source. When @@ -342,18 +368,17 @@ abstract class OutputWriter { * @since 1.4.0 */ def close(): Unit -} -/** - * This is an internal, private version of [[OutputWriter]] with an writeInternal method that - * accepts an [[InternalRow]] rather than an [[Row]]. Data sources that return this must have - * the conversion flag set to false. 
- */ -private[sql] abstract class OutputWriterInternal extends OutputWriter { + private var converter: InternalRow => Row = _ - override def write(row: Row): Unit = throw new UnsupportedOperationException + protected[sql] def initConverter(dataSchema: StructType) = { + converter = + CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row] + } - def writeInternal(row: InternalRow): Unit + protected[sql] def writeInternal(row: InternalRow): Unit = { + write(converter(row)) + } } /** @@ -381,9 +406,9 @@ private[sql] abstract class OutputWriterInternal extends OutputWriter { */ @Experimental abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[PartitionSpec]) - extends BaseRelation with Logging { + extends BaseRelation with FileRelation with Logging { - logInfo("Constructing HadoopFsRelation") + override def toString: String = getClass.getSimpleName + paths.mkString("[", ",", "]") def this() = this(None) @@ -461,8 +486,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio val spec = discoverPartitions() val partitionColumnTypes = spec.partitionColumns.map(_.dataType) val castedPartitions = spec.partitions.map { case p @ Partition(values, path) => - val literals = values.toSeq.zip(partitionColumnTypes).map { - case (value, dataType) => Literal.create(value, dataType) + val literals = partitionColumnTypes.zipWithIndex.map { case (dt, i) => + Literal.create(values.get(i, dt), dt) } val castedValues = partitionSchema.zip(literals).map { case (field, literal) => Cast(literal, field.dataType).eval() @@ -491,6 +516,10 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio */ def paths: Array[String] + override def inputFiles: Array[String] = cachedLeafStatuses().map(_.getPath.toString).toArray + + override def sizeInBytes: Long = cachedLeafStatuses().map(_.getLen).sum + /** * Partition columns. Can be either defined by [[userDefinedPartitionColumns]] or automatically * discovered. Note that they should always be nullable. @@ -535,7 +564,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio }) } - private[sql] final def buildScan( + final private[sql] def buildScan( requiredColumns: Array[String], filters: Array[Filter], inputPaths: Array[String], diff --git a/sql/core/src/test/README.md b/sql/core/src/test/README.md index 3dd9861b4896..421c2ea4f7ae 100644 --- a/sql/core/src/test/README.md +++ b/sql/core/src/test/README.md @@ -6,23 +6,19 @@ The following directories and files are used for Parquet compatibility tests: . ├── README.md # This file ├── avro -│   ├── parquet-compat.avdl # Testing Avro IDL -│   └── parquet-compat.avpr # !! NO TOUCH !! Protocol file generated from parquet-compat.avdl +│   ├── *.avdl # Testing Avro IDL(s) +│   └── *.avpr # !! NO TOUCH !! Protocol files generated from Avro IDL(s) ├── gen-java # !! NO TOUCH !! 
Generated Java code ├── scripts -│   └── gen-code.sh # Script used to generate Java code for Thrift and Avro +│   ├── gen-avro.sh # Script used to generate Java code for Avro +│   └── gen-thrift.sh # Script used to generate Java code for Thrift └── thrift - └── parquet-compat.thrift # Testing Thrift schema + └── *.thrift # Testing Thrift schema(s) ``` -Generated Java code are used in the following test suites: - -- `org.apache.spark.sql.parquet.ParquetAvroCompatibilitySuite` -- `org.apache.spark.sql.parquet.ParquetThriftCompatibilitySuite` - To avoid code generation during build time, Java code generated from testing Thrift schema and Avro IDL are also checked in. -When updating the testing Thrift schema and Avro IDL, please run `gen-code.sh` to update all the generated Java code. +When updating the testing Thrift schema and Avro IDL, please run `gen-avro.sh` and `gen-thrift.sh` accordingly to update generated Java code. ## Prerequisites diff --git a/sql/core/src/test/avro/parquet-compat.avdl b/sql/core/src/test/avro/parquet-compat.avdl index 24729f6143e6..c5eb5b5164cf 100644 --- a/sql/core/src/test/avro/parquet-compat.avdl +++ b/sql/core/src/test/avro/parquet-compat.avdl @@ -16,14 +16,25 @@ */ // This is a test protocol for testing parquet-avro compatibility. -@namespace("org.apache.spark.sql.parquet.test.avro") +@namespace("org.apache.spark.sql.execution.datasources.parquet.test.avro") protocol CompatibilityTest { + enum Suit { + SPADES, + HEARTS, + DIAMONDS, + CLUBS + } + + record ParquetEnum { + Suit suit; + } + record Nested { array nested_ints_column; string nested_string_column; } - record ParquetAvroCompat { + record AvroPrimitives { boolean bool_column; int int_column; long long_column; @@ -31,7 +42,9 @@ protocol CompatibilityTest { double double_column; bytes binary_column; string string_column; + } + record AvroOptionalPrimitives { union { null, boolean } maybe_bool_column; union { null, int } maybe_int_column; union { null, long } maybe_long_column; @@ -39,7 +52,22 @@ protocol CompatibilityTest { union { null, double } maybe_double_column; union { null, bytes } maybe_binary_column; union { null, string } maybe_string_column; + } + record AvroNonNullableArrays { + array strings_column; + union { null, array } maybe_ints_column; + } + + record AvroArrayOfArray { + array> int_arrays_column; + } + + record AvroMapOfArray { + map> string_to_ints_column; + } + + record ParquetAvroCompat { array strings_column; map string_to_int_column; map> complex_column; diff --git a/sql/core/src/test/avro/parquet-compat.avpr b/sql/core/src/test/avro/parquet-compat.avpr index a83b7c990dd2..9ad315b74fb4 100644 --- a/sql/core/src/test/avro/parquet-compat.avpr +++ b/sql/core/src/test/avro/parquet-compat.avpr @@ -1,7 +1,18 @@ { "protocol" : "CompatibilityTest", - "namespace" : "org.apache.spark.sql.parquet.test.avro", + "namespace" : "org.apache.spark.sql.execution.datasources.parquet.test.avro", "types" : [ { + "type" : "enum", + "name" : "Suit", + "symbols" : [ "SPADES", "HEARTS", "DIAMONDS", "CLUBS" ] + }, { + "type" : "record", + "name" : "ParquetEnum", + "fields" : [ { + "name" : "suit", + "type" : "Suit" + } ] + }, { "type" : "record", "name" : "Nested", "fields" : [ { @@ -16,7 +27,7 @@ } ] }, { "type" : "record", - "name" : "ParquetAvroCompat", + "name" : "AvroPrimitives", "fields" : [ { "name" : "bool_column", "type" : "boolean" @@ -38,7 +49,11 @@ }, { "name" : "string_column", "type" : "string" - }, { + } ] + }, { + "type" : "record", + "name" : "AvroOptionalPrimitives", + "fields" : [ { 
"name" : "maybe_bool_column", "type" : [ "null", "boolean" ] }, { @@ -59,7 +74,53 @@ }, { "name" : "maybe_string_column", "type" : [ "null", "string" ] + } ] + }, { + "type" : "record", + "name" : "AvroNonNullableArrays", + "fields" : [ { + "name" : "strings_column", + "type" : { + "type" : "array", + "items" : "string" + } }, { + "name" : "maybe_ints_column", + "type" : [ "null", { + "type" : "array", + "items" : "int" + } ] + } ] + }, { + "type" : "record", + "name" : "AvroArrayOfArray", + "fields" : [ { + "name" : "int_arrays_column", + "type" : { + "type" : "array", + "items" : { + "type" : "array", + "items" : "int" + } + } + } ] + }, { + "type" : "record", + "name" : "AvroMapOfArray", + "fields" : [ { + "name" : "string_to_ints_column", + "type" : { + "type" : "map", + "values" : { + "type" : "array", + "items" : "int" + } + } + } ] + }, { + "type" : "record", + "name" : "ParquetAvroCompat", + "fields" : [ { "name" : "strings_column", "type" : { "type" : "array", diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java new file mode 100644 index 000000000000..ee327827903e --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java @@ -0,0 +1,142 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroArrayOfArray extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroArrayOfArray\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"int_arrays_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.List> int_arrays_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroArrayOfArray() {} + + /** + * All-args constructor. + */ + public AvroArrayOfArray(java.util.List> int_arrays_column) { + this.int_arrays_column = int_arrays_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return int_arrays_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: int_arrays_column = (java.util.List>)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'int_arrays_column' field. + */ + public java.util.List> getIntArraysColumn() { + return int_arrays_column; + } + + /** + * Sets the value of the 'int_arrays_column' field. + * @param value the value to set. 
+ */ + public void setIntArraysColumn(java.util.List> value) { + this.int_arrays_column = value; + } + + /** Creates a new AvroArrayOfArray RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(); + } + + /** Creates a new AvroArrayOfArray RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(other); + } + + /** Creates a new AvroArrayOfArray RecordBuilder by copying an existing AvroArrayOfArray instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(other); + } + + /** + * RecordBuilder for AvroArrayOfArray instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.List> int_arrays_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder other) { + super(other); + if (isValidValue(fields()[0], other.int_arrays_column)) { + this.int_arrays_column = data().deepCopy(fields()[0].schema(), other.int_arrays_column); + fieldSetFlags()[0] = true; + } + } + + /** Creates a Builder by copying an existing AvroArrayOfArray instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.SCHEMA$); + if (isValidValue(fields()[0], other.int_arrays_column)) { + this.int_arrays_column = data().deepCopy(fields()[0].schema(), other.int_arrays_column); + fieldSetFlags()[0] = true; + } + } + + /** Gets the value of the 'int_arrays_column' field */ + public java.util.List> getIntArraysColumn() { + return int_arrays_column; + } + + /** Sets the value of the 'int_arrays_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder setIntArraysColumn(java.util.List> value) { + validate(fields()[0], value); + this.int_arrays_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'int_arrays_column' field has been set */ + public boolean hasIntArraysColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'int_arrays_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder clearIntArraysColumn() { + int_arrays_column = null; + fieldSetFlags()[0] = false; + return this; + } + + @Override + public AvroArrayOfArray build() { + try { + AvroArrayOfArray record = new AvroArrayOfArray(); + record.int_arrays_column = fieldSetFlags()[0] ? 
this.int_arrays_column : (java.util.List>) defaultValue(fields()[0]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java new file mode 100644 index 000000000000..727f6a7bf733 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java @@ -0,0 +1,142 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroMapOfArray extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroMapOfArray\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"string_to_ints_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"int\"},\"avro.java.string\":\"String\"}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.Map> string_to_ints_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroMapOfArray() {} + + /** + * All-args constructor. + */ + public AvroMapOfArray(java.util.Map> string_to_ints_column) { + this.string_to_ints_column = string_to_ints_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return string_to_ints_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: string_to_ints_column = (java.util.Map>)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'string_to_ints_column' field. + */ + public java.util.Map> getStringToIntsColumn() { + return string_to_ints_column; + } + + /** + * Sets the value of the 'string_to_ints_column' field. + * @param value the value to set. 
+ */ + public void setStringToIntsColumn(java.util.Map> value) { + this.string_to_ints_column = value; + } + + /** Creates a new AvroMapOfArray RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(); + } + + /** Creates a new AvroMapOfArray RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(other); + } + + /** Creates a new AvroMapOfArray RecordBuilder by copying an existing AvroMapOfArray instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(other); + } + + /** + * RecordBuilder for AvroMapOfArray instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.Map> string_to_ints_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder other) { + super(other); + if (isValidValue(fields()[0], other.string_to_ints_column)) { + this.string_to_ints_column = data().deepCopy(fields()[0].schema(), other.string_to_ints_column); + fieldSetFlags()[0] = true; + } + } + + /** Creates a Builder by copying an existing AvroMapOfArray instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.SCHEMA$); + if (isValidValue(fields()[0], other.string_to_ints_column)) { + this.string_to_ints_column = data().deepCopy(fields()[0].schema(), other.string_to_ints_column); + fieldSetFlags()[0] = true; + } + } + + /** Gets the value of the 'string_to_ints_column' field */ + public java.util.Map> getStringToIntsColumn() { + return string_to_ints_column; + } + + /** Sets the value of the 'string_to_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder setStringToIntsColumn(java.util.Map> value) { + validate(fields()[0], value); + this.string_to_ints_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'string_to_ints_column' field has been set */ + public boolean hasStringToIntsColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'string_to_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder clearStringToIntsColumn() { + string_to_ints_column = null; + fieldSetFlags()[0] = false; + return this; + } + + @Override + public AvroMapOfArray build() { + try { + AvroMapOfArray record = new AvroMapOfArray(); + record.string_to_ints_column = fieldSetFlags()[0] ? 
this.string_to_ints_column : (java.util.Map>) defaultValue(fields()[0]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java new file mode 100644 index 000000000000..934793f42f9c --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java @@ -0,0 +1,196 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroNonNullableArrays extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroNonNullableArrays\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"maybe_ints_column\",\"type\":[\"null\",{\"type\":\"array\",\"items\":\"int\"}]}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.List strings_column; + @Deprecated public java.util.List maybe_ints_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroNonNullableArrays() {} + + /** + * All-args constructor. + */ + public AvroNonNullableArrays(java.util.List strings_column, java.util.List maybe_ints_column) { + this.strings_column = strings_column; + this.maybe_ints_column = maybe_ints_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return strings_column; + case 1: return maybe_ints_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: strings_column = (java.util.List)value$; break; + case 1: maybe_ints_column = (java.util.List)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'strings_column' field. + */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** + * Sets the value of the 'strings_column' field. + * @param value the value to set. + */ + public void setStringsColumn(java.util.List value) { + this.strings_column = value; + } + + /** + * Gets the value of the 'maybe_ints_column' field. + */ + public java.util.List getMaybeIntsColumn() { + return maybe_ints_column; + } + + /** + * Sets the value of the 'maybe_ints_column' field. + * @param value the value to set. 
+ */ + public void setMaybeIntsColumn(java.util.List value) { + this.maybe_ints_column = value; + } + + /** Creates a new AvroNonNullableArrays RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(); + } + + /** Creates a new AvroNonNullableArrays RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(other); + } + + /** Creates a new AvroNonNullableArrays RecordBuilder by copying an existing AvroNonNullableArrays instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(other); + } + + /** + * RecordBuilder for AvroNonNullableArrays instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.List strings_column; + private java.util.List maybe_ints_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder other) { + super(other); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_ints_column)) { + this.maybe_ints_column = data().deepCopy(fields()[1].schema(), other.maybe_ints_column); + fieldSetFlags()[1] = true; + } + } + + /** Creates a Builder by copying an existing AvroNonNullableArrays instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.SCHEMA$); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_ints_column)) { + this.maybe_ints_column = data().deepCopy(fields()[1].schema(), other.maybe_ints_column); + fieldSetFlags()[1] = true; + } + } + + /** Gets the value of the 'strings_column' field */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** Sets the value of the 'strings_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder setStringsColumn(java.util.List value) { + validate(fields()[0], value); + this.strings_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'strings_column' field has been set */ + public boolean hasStringsColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'strings_column' 
field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder clearStringsColumn() { + strings_column = null; + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'maybe_ints_column' field */ + public java.util.List getMaybeIntsColumn() { + return maybe_ints_column; + } + + /** Sets the value of the 'maybe_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder setMaybeIntsColumn(java.util.List value) { + validate(fields()[1], value); + this.maybe_ints_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'maybe_ints_column' field has been set */ + public boolean hasMaybeIntsColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'maybe_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder clearMaybeIntsColumn() { + maybe_ints_column = null; + fieldSetFlags()[1] = false; + return this; + } + + @Override + public AvroNonNullableArrays build() { + try { + AvroNonNullableArrays record = new AvroNonNullableArrays(); + record.strings_column = fieldSetFlags()[0] ? this.strings_column : (java.util.List) defaultValue(fields()[0]); + record.maybe_ints_column = fieldSetFlags()[1] ? this.maybe_ints_column : (java.util.List) defaultValue(fields()[1]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java new file mode 100644 index 000000000000..e4d1ead8dd15 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java @@ -0,0 +1,466 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroOptionalPrimitives extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroOptionalPrimitives\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.lang.Boolean maybe_bool_column; + @Deprecated public java.lang.Integer maybe_int_column; + @Deprecated public java.lang.Long maybe_long_column; + @Deprecated public java.lang.Float maybe_float_column; + @Deprecated public java.lang.Double maybe_double_column; + @Deprecated public java.nio.ByteBuffer maybe_binary_column; + @Deprecated public java.lang.String 
maybe_string_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroOptionalPrimitives() {} + + /** + * All-args constructor. + */ + public AvroOptionalPrimitives(java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column) { + this.maybe_bool_column = maybe_bool_column; + this.maybe_int_column = maybe_int_column; + this.maybe_long_column = maybe_long_column; + this.maybe_float_column = maybe_float_column; + this.maybe_double_column = maybe_double_column; + this.maybe_binary_column = maybe_binary_column; + this.maybe_string_column = maybe_string_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return maybe_bool_column; + case 1: return maybe_int_column; + case 2: return maybe_long_column; + case 3: return maybe_float_column; + case 4: return maybe_double_column; + case 5: return maybe_binary_column; + case 6: return maybe_string_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: maybe_bool_column = (java.lang.Boolean)value$; break; + case 1: maybe_int_column = (java.lang.Integer)value$; break; + case 2: maybe_long_column = (java.lang.Long)value$; break; + case 3: maybe_float_column = (java.lang.Float)value$; break; + case 4: maybe_double_column = (java.lang.Double)value$; break; + case 5: maybe_binary_column = (java.nio.ByteBuffer)value$; break; + case 6: maybe_string_column = (java.lang.String)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'maybe_bool_column' field. + */ + public java.lang.Boolean getMaybeBoolColumn() { + return maybe_bool_column; + } + + /** + * Sets the value of the 'maybe_bool_column' field. + * @param value the value to set. + */ + public void setMaybeBoolColumn(java.lang.Boolean value) { + this.maybe_bool_column = value; + } + + /** + * Gets the value of the 'maybe_int_column' field. + */ + public java.lang.Integer getMaybeIntColumn() { + return maybe_int_column; + } + + /** + * Sets the value of the 'maybe_int_column' field. + * @param value the value to set. + */ + public void setMaybeIntColumn(java.lang.Integer value) { + this.maybe_int_column = value; + } + + /** + * Gets the value of the 'maybe_long_column' field. + */ + public java.lang.Long getMaybeLongColumn() { + return maybe_long_column; + } + + /** + * Sets the value of the 'maybe_long_column' field. + * @param value the value to set. + */ + public void setMaybeLongColumn(java.lang.Long value) { + this.maybe_long_column = value; + } + + /** + * Gets the value of the 'maybe_float_column' field. + */ + public java.lang.Float getMaybeFloatColumn() { + return maybe_float_column; + } + + /** + * Sets the value of the 'maybe_float_column' field. + * @param value the value to set. 
+ */ + public void setMaybeFloatColumn(java.lang.Float value) { + this.maybe_float_column = value; + } + + /** + * Gets the value of the 'maybe_double_column' field. + */ + public java.lang.Double getMaybeDoubleColumn() { + return maybe_double_column; + } + + /** + * Sets the value of the 'maybe_double_column' field. + * @param value the value to set. + */ + public void setMaybeDoubleColumn(java.lang.Double value) { + this.maybe_double_column = value; + } + + /** + * Gets the value of the 'maybe_binary_column' field. + */ + public java.nio.ByteBuffer getMaybeBinaryColumn() { + return maybe_binary_column; + } + + /** + * Sets the value of the 'maybe_binary_column' field. + * @param value the value to set. + */ + public void setMaybeBinaryColumn(java.nio.ByteBuffer value) { + this.maybe_binary_column = value; + } + + /** + * Gets the value of the 'maybe_string_column' field. + */ + public java.lang.String getMaybeStringColumn() { + return maybe_string_column; + } + + /** + * Sets the value of the 'maybe_string_column' field. + * @param value the value to set. + */ + public void setMaybeStringColumn(java.lang.String value) { + this.maybe_string_column = value; + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(); + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(other); + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder by copying an existing AvroOptionalPrimitives instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(other); + } + + /** + * RecordBuilder for AvroOptionalPrimitives instances. 
+ */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.lang.Boolean maybe_bool_column; + private java.lang.Integer maybe_int_column; + private java.lang.Long maybe_long_column; + private java.lang.Float maybe_float_column; + private java.lang.Double maybe_double_column; + private java.nio.ByteBuffer maybe_binary_column; + private java.lang.String maybe_string_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder other) { + super(other); + if (isValidValue(fields()[0], other.maybe_bool_column)) { + this.maybe_bool_column = data().deepCopy(fields()[0].schema(), other.maybe_bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_int_column)) { + this.maybe_int_column = data().deepCopy(fields()[1].schema(), other.maybe_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.maybe_long_column)) { + this.maybe_long_column = data().deepCopy(fields()[2].schema(), other.maybe_long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.maybe_float_column)) { + this.maybe_float_column = data().deepCopy(fields()[3].schema(), other.maybe_float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.maybe_double_column)) { + this.maybe_double_column = data().deepCopy(fields()[4].schema(), other.maybe_double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.maybe_binary_column)) { + this.maybe_binary_column = data().deepCopy(fields()[5].schema(), other.maybe_binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.maybe_string_column)) { + this.maybe_string_column = data().deepCopy(fields()[6].schema(), other.maybe_string_column); + fieldSetFlags()[6] = true; + } + } + + /** Creates a Builder by copying an existing AvroOptionalPrimitives instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.SCHEMA$); + if (isValidValue(fields()[0], other.maybe_bool_column)) { + this.maybe_bool_column = data().deepCopy(fields()[0].schema(), other.maybe_bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_int_column)) { + this.maybe_int_column = data().deepCopy(fields()[1].schema(), other.maybe_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.maybe_long_column)) { + this.maybe_long_column = data().deepCopy(fields()[2].schema(), other.maybe_long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.maybe_float_column)) { + this.maybe_float_column = data().deepCopy(fields()[3].schema(), other.maybe_float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.maybe_double_column)) { + this.maybe_double_column = data().deepCopy(fields()[4].schema(), other.maybe_double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.maybe_binary_column)) { + this.maybe_binary_column = data().deepCopy(fields()[5].schema(), other.maybe_binary_column); + fieldSetFlags()[5] = true; + } 
+ if (isValidValue(fields()[6], other.maybe_string_column)) { + this.maybe_string_column = data().deepCopy(fields()[6].schema(), other.maybe_string_column); + fieldSetFlags()[6] = true; + } + } + + /** Gets the value of the 'maybe_bool_column' field */ + public java.lang.Boolean getMaybeBoolColumn() { + return maybe_bool_column; + } + + /** Sets the value of the 'maybe_bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeBoolColumn(java.lang.Boolean value) { + validate(fields()[0], value); + this.maybe_bool_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'maybe_bool_column' field has been set */ + public boolean hasMaybeBoolColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'maybe_bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeBoolColumn() { + maybe_bool_column = null; + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'maybe_int_column' field */ + public java.lang.Integer getMaybeIntColumn() { + return maybe_int_column; + } + + /** Sets the value of the 'maybe_int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeIntColumn(java.lang.Integer value) { + validate(fields()[1], value); + this.maybe_int_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'maybe_int_column' field has been set */ + public boolean hasMaybeIntColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'maybe_int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeIntColumn() { + maybe_int_column = null; + fieldSetFlags()[1] = false; + return this; + } + + /** Gets the value of the 'maybe_long_column' field */ + public java.lang.Long getMaybeLongColumn() { + return maybe_long_column; + } + + /** Sets the value of the 'maybe_long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeLongColumn(java.lang.Long value) { + validate(fields()[2], value); + this.maybe_long_column = value; + fieldSetFlags()[2] = true; + return this; + } + + /** Checks whether the 'maybe_long_column' field has been set */ + public boolean hasMaybeLongColumn() { + return fieldSetFlags()[2]; + } + + /** Clears the value of the 'maybe_long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeLongColumn() { + maybe_long_column = null; + fieldSetFlags()[2] = false; + return this; + } + + /** Gets the value of the 'maybe_float_column' field */ + public java.lang.Float getMaybeFloatColumn() { + return maybe_float_column; + } + + /** Sets the value of the 'maybe_float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeFloatColumn(java.lang.Float value) { + validate(fields()[3], value); + this.maybe_float_column = value; + fieldSetFlags()[3] = true; + return this; + } + + /** Checks whether the 'maybe_float_column' field has been set */ + public boolean hasMaybeFloatColumn() { + return fieldSetFlags()[3]; + } + + /** Clears the value of the 'maybe_float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder 
clearMaybeFloatColumn() { + maybe_float_column = null; + fieldSetFlags()[3] = false; + return this; + } + + /** Gets the value of the 'maybe_double_column' field */ + public java.lang.Double getMaybeDoubleColumn() { + return maybe_double_column; + } + + /** Sets the value of the 'maybe_double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeDoubleColumn(java.lang.Double value) { + validate(fields()[4], value); + this.maybe_double_column = value; + fieldSetFlags()[4] = true; + return this; + } + + /** Checks whether the 'maybe_double_column' field has been set */ + public boolean hasMaybeDoubleColumn() { + return fieldSetFlags()[4]; + } + + /** Clears the value of the 'maybe_double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeDoubleColumn() { + maybe_double_column = null; + fieldSetFlags()[4] = false; + return this; + } + + /** Gets the value of the 'maybe_binary_column' field */ + public java.nio.ByteBuffer getMaybeBinaryColumn() { + return maybe_binary_column; + } + + /** Sets the value of the 'maybe_binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) { + validate(fields()[5], value); + this.maybe_binary_column = value; + fieldSetFlags()[5] = true; + return this; + } + + /** Checks whether the 'maybe_binary_column' field has been set */ + public boolean hasMaybeBinaryColumn() { + return fieldSetFlags()[5]; + } + + /** Clears the value of the 'maybe_binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeBinaryColumn() { + maybe_binary_column = null; + fieldSetFlags()[5] = false; + return this; + } + + /** Gets the value of the 'maybe_string_column' field */ + public java.lang.String getMaybeStringColumn() { + return maybe_string_column; + } + + /** Sets the value of the 'maybe_string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeStringColumn(java.lang.String value) { + validate(fields()[6], value); + this.maybe_string_column = value; + fieldSetFlags()[6] = true; + return this; + } + + /** Checks whether the 'maybe_string_column' field has been set */ + public boolean hasMaybeStringColumn() { + return fieldSetFlags()[6]; + } + + /** Clears the value of the 'maybe_string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeStringColumn() { + maybe_string_column = null; + fieldSetFlags()[6] = false; + return this; + } + + @Override + public AvroOptionalPrimitives build() { + try { + AvroOptionalPrimitives record = new AvroOptionalPrimitives(); + record.maybe_bool_column = fieldSetFlags()[0] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[0]); + record.maybe_int_column = fieldSetFlags()[1] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[1]); + record.maybe_long_column = fieldSetFlags()[2] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[2]); + record.maybe_float_column = fieldSetFlags()[3] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[3]); + record.maybe_double_column = fieldSetFlags()[4] ? 
this.maybe_double_column : (java.lang.Double) defaultValue(fields()[4]); + record.maybe_binary_column = fieldSetFlags()[5] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); + record.maybe_string_column = fieldSetFlags()[6] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[6]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java new file mode 100644 index 000000000000..1c2afed16781 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java @@ -0,0 +1,461 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroPrimitives extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroPrimitives\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public boolean bool_column; + @Deprecated public int int_column; + @Deprecated public long long_column; + @Deprecated public float float_column; + @Deprecated public double double_column; + @Deprecated public java.nio.ByteBuffer binary_column; + @Deprecated public java.lang.String string_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroPrimitives() {} + + /** + * All-args constructor. + */ + public AvroPrimitives(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column) { + this.bool_column = bool_column; + this.int_column = int_column; + this.long_column = long_column; + this.float_column = float_column; + this.double_column = double_column; + this.binary_column = binary_column; + this.string_column = string_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return bool_column; + case 1: return int_column; + case 2: return long_column; + case 3: return float_column; + case 4: return double_column; + case 5: return binary_column; + case 6: return string_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. 
+ @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: bool_column = (java.lang.Boolean)value$; break; + case 1: int_column = (java.lang.Integer)value$; break; + case 2: long_column = (java.lang.Long)value$; break; + case 3: float_column = (java.lang.Float)value$; break; + case 4: double_column = (java.lang.Double)value$; break; + case 5: binary_column = (java.nio.ByteBuffer)value$; break; + case 6: string_column = (java.lang.String)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'bool_column' field. + */ + public java.lang.Boolean getBoolColumn() { + return bool_column; + } + + /** + * Sets the value of the 'bool_column' field. + * @param value the value to set. + */ + public void setBoolColumn(java.lang.Boolean value) { + this.bool_column = value; + } + + /** + * Gets the value of the 'int_column' field. + */ + public java.lang.Integer getIntColumn() { + return int_column; + } + + /** + * Sets the value of the 'int_column' field. + * @param value the value to set. + */ + public void setIntColumn(java.lang.Integer value) { + this.int_column = value; + } + + /** + * Gets the value of the 'long_column' field. + */ + public java.lang.Long getLongColumn() { + return long_column; + } + + /** + * Sets the value of the 'long_column' field. + * @param value the value to set. + */ + public void setLongColumn(java.lang.Long value) { + this.long_column = value; + } + + /** + * Gets the value of the 'float_column' field. + */ + public java.lang.Float getFloatColumn() { + return float_column; + } + + /** + * Sets the value of the 'float_column' field. + * @param value the value to set. + */ + public void setFloatColumn(java.lang.Float value) { + this.float_column = value; + } + + /** + * Gets the value of the 'double_column' field. + */ + public java.lang.Double getDoubleColumn() { + return double_column; + } + + /** + * Sets the value of the 'double_column' field. + * @param value the value to set. + */ + public void setDoubleColumn(java.lang.Double value) { + this.double_column = value; + } + + /** + * Gets the value of the 'binary_column' field. + */ + public java.nio.ByteBuffer getBinaryColumn() { + return binary_column; + } + + /** + * Sets the value of the 'binary_column' field. + * @param value the value to set. + */ + public void setBinaryColumn(java.nio.ByteBuffer value) { + this.binary_column = value; + } + + /** + * Gets the value of the 'string_column' field. + */ + public java.lang.String getStringColumn() { + return string_column; + } + + /** + * Sets the value of the 'string_column' field. + * @param value the value to set. 
+ */ + public void setStringColumn(java.lang.String value) { + this.string_column = value; + } + + /** Creates a new AvroPrimitives RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(); + } + + /** Creates a new AvroPrimitives RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(other); + } + + /** Creates a new AvroPrimitives RecordBuilder by copying an existing AvroPrimitives instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(other); + } + + /** + * RecordBuilder for AvroPrimitives instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private boolean bool_column; + private int int_column; + private long long_column; + private float float_column; + private double double_column; + private java.nio.ByteBuffer binary_column; + private java.lang.String string_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder other) { + super(other); + if (isValidValue(fields()[0], other.bool_column)) { + this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.int_column)) { + this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.long_column)) { + this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.float_column)) { + this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.double_column)) { + this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.binary_column)) { + this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.string_column)) { + this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); + fieldSetFlags()[6] = true; + } + } + + /** Creates a Builder by copying an existing AvroPrimitives instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.SCHEMA$); + if (isValidValue(fields()[0], other.bool_column)) { + this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + 
fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.int_column)) { + this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.long_column)) { + this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.float_column)) { + this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.double_column)) { + this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.binary_column)) { + this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.string_column)) { + this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); + fieldSetFlags()[6] = true; + } + } + + /** Gets the value of the 'bool_column' field */ + public java.lang.Boolean getBoolColumn() { + return bool_column; + } + + /** Sets the value of the 'bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setBoolColumn(boolean value) { + validate(fields()[0], value); + this.bool_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'bool_column' field has been set */ + public boolean hasBoolColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearBoolColumn() { + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'int_column' field */ + public java.lang.Integer getIntColumn() { + return int_column; + } + + /** Sets the value of the 'int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setIntColumn(int value) { + validate(fields()[1], value); + this.int_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'int_column' field has been set */ + public boolean hasIntColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearIntColumn() { + fieldSetFlags()[1] = false; + return this; + } + + /** Gets the value of the 'long_column' field */ + public java.lang.Long getLongColumn() { + return long_column; + } + + /** Sets the value of the 'long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setLongColumn(long value) { + validate(fields()[2], value); + this.long_column = value; + fieldSetFlags()[2] = true; + return this; + } + + /** Checks whether the 'long_column' field has been set */ + public boolean hasLongColumn() { + return fieldSetFlags()[2]; + } + + /** Clears the value of the 'long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearLongColumn() { + fieldSetFlags()[2] = false; + return this; + } + + /** Gets the value of the 'float_column' field */ + public java.lang.Float getFloatColumn() { + return float_column; + } + + /** Sets the value of the 'float_column' field */ + public 
org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setFloatColumn(float value) { + validate(fields()[3], value); + this.float_column = value; + fieldSetFlags()[3] = true; + return this; + } + + /** Checks whether the 'float_column' field has been set */ + public boolean hasFloatColumn() { + return fieldSetFlags()[3]; + } + + /** Clears the value of the 'float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearFloatColumn() { + fieldSetFlags()[3] = false; + return this; + } + + /** Gets the value of the 'double_column' field */ + public java.lang.Double getDoubleColumn() { + return double_column; + } + + /** Sets the value of the 'double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setDoubleColumn(double value) { + validate(fields()[4], value); + this.double_column = value; + fieldSetFlags()[4] = true; + return this; + } + + /** Checks whether the 'double_column' field has been set */ + public boolean hasDoubleColumn() { + return fieldSetFlags()[4]; + } + + /** Clears the value of the 'double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearDoubleColumn() { + fieldSetFlags()[4] = false; + return this; + } + + /** Gets the value of the 'binary_column' field */ + public java.nio.ByteBuffer getBinaryColumn() { + return binary_column; + } + + /** Sets the value of the 'binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setBinaryColumn(java.nio.ByteBuffer value) { + validate(fields()[5], value); + this.binary_column = value; + fieldSetFlags()[5] = true; + return this; + } + + /** Checks whether the 'binary_column' field has been set */ + public boolean hasBinaryColumn() { + return fieldSetFlags()[5]; + } + + /** Clears the value of the 'binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearBinaryColumn() { + binary_column = null; + fieldSetFlags()[5] = false; + return this; + } + + /** Gets the value of the 'string_column' field */ + public java.lang.String getStringColumn() { + return string_column; + } + + /** Sets the value of the 'string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setStringColumn(java.lang.String value) { + validate(fields()[6], value); + this.string_column = value; + fieldSetFlags()[6] = true; + return this; + } + + /** Checks whether the 'string_column' field has been set */ + public boolean hasStringColumn() { + return fieldSetFlags()[6]; + } + + /** Clears the value of the 'string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearStringColumn() { + string_column = null; + fieldSetFlags()[6] = false; + return this; + } + + @Override + public AvroPrimitives build() { + try { + AvroPrimitives record = new AvroPrimitives(); + record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]); + record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]); + record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]); + record.float_column = fieldSetFlags()[3] ? 
this.float_column : (java.lang.Float) defaultValue(fields()[3]); + record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]); + record.binary_column = fieldSetFlags()[5] ? this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); + record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java new file mode 100644 index 000000000000..28fdc1dfb911 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java @@ -0,0 +1,17 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; + +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public interface CompatibilityTest { + public static final org.apache.avro.Protocol PROTOCOL = org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"types\":[{\"type\":\"enum\",\"name\":\"Suit\",\"symbols\":[\"SPADES\",\"HEARTS\",\"DIAMONDS\",\"CLUBS\"]},{\"type\":\"record\",\"name\":\"ParquetEnum\",\"fields\":[{\"name\":\"suit\",\"type\":\"Suit\"}]},{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"AvroPrimitives\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"AvroOptionalPrimitives\",\"fields\":[{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]}]},{\"type\":\"record\",\"name\":\"AvroNonNullableArrays\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"maybe_ints_column\",\"type\":[\"null\",{\"type\":\"array\",\"items\":\"int\"}]}]},{\"type\":\"record\",\"name\":\"AvroArrayOfArray\",\"fields\":[{\"name\":\"int_arrays_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}}]},{\"type\":\"record\",\"name\":\"AvroMapOfArray\",\"fields\":[{\"name\":\"string_to_ints_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"int\"},\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"Parque
tAvroCompat\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}"); + + @SuppressWarnings("all") + public interface Callback extends CompatibilityTest { + public static final org.apache.avro.Protocol PROTOCOL = org.apache.spark.sql.execution.datasources.parquet.test.avro.CompatibilityTest.PROTOCOL; + } +} \ No newline at end of file diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Nested.java similarity index 75% rename from sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java rename to sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Nested.java index 051f1ee90386..a7bf4841919c 100644 --- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Nested.java @@ -3,11 +3,11 @@ * * DO NOT EDIT DIRECTLY */ -package org.apache.spark.sql.parquet.test.avro; +package org.apache.spark.sql.execution.datasources.parquet.test.avro; @SuppressWarnings("all") @org.apache.avro.specific.AvroGenerated public class Nested extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { - public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Nested\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}"); + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Nested\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}"); public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } @Deprecated public java.util.List nested_ints_column; @Deprecated public java.lang.String nested_string_column; @@ -77,18 +77,18 @@ public void setNestedStringColumn(java.lang.String value) { } /** Creates a new Nested RecordBuilder */ - public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder() { - return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(); + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder(); } /** Creates a new Nested RecordBuilder by copying an existing Builder */ - public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) { - return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other); + public static 
org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder(other); } /** Creates a new Nested RecordBuilder by copying an existing Nested instance */ - public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested other) { - return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other); + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder(other); } /** @@ -102,11 +102,11 @@ public static class Builder extends org.apache.avro.specific.SpecificRecordBuild /** Creates a new Builder */ private Builder() { - super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$); + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.SCHEMA$); } /** Creates a Builder by copying an existing Builder */ - private Builder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) { + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder other) { super(other); if (isValidValue(fields()[0], other.nested_ints_column)) { this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column); @@ -119,8 +119,8 @@ private Builder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) { } /** Creates a Builder by copying an existing Nested instance */ - private Builder(org.apache.spark.sql.parquet.test.avro.Nested other) { - super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$); + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.SCHEMA$); if (isValidValue(fields()[0], other.nested_ints_column)) { this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column); fieldSetFlags()[0] = true; @@ -137,7 +137,7 @@ public java.util.List getNestedIntsColumn() { } /** Sets the value of the 'nested_ints_column' field */ - public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedIntsColumn(java.util.List value) { + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder setNestedIntsColumn(java.util.List value) { validate(fields()[0], value); this.nested_ints_column = value; fieldSetFlags()[0] = true; @@ -150,7 +150,7 @@ public boolean hasNestedIntsColumn() { } /** Clears the value of the 'nested_ints_column' field */ - public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedIntsColumn() { + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder clearNestedIntsColumn() { nested_ints_column = null; fieldSetFlags()[0] = false; return this; @@ -162,7 +162,7 @@ public java.lang.String getNestedStringColumn() { } /** Sets the value of the 'nested_string_column' field */ - public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedStringColumn(java.lang.String value) { + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder setNestedStringColumn(java.lang.String value) { validate(fields()[1], value); this.nested_string_column = value; fieldSetFlags()[1] = true; @@ -175,7 +175,7 @@ public boolean 
hasNestedStringColumn() { } /** Clears the value of the 'nested_string_column' field */ - public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedStringColumn() { + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Nested.Builder clearNestedStringColumn() { nested_string_column = null; fieldSetFlags()[1] = false; return this; diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java new file mode 100644 index 000000000000..ef12d193f916 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java @@ -0,0 +1,250 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class ParquetAvroCompat extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.List strings_column; + @Deprecated public java.util.Map string_to_int_column; + @Deprecated public java.util.Map> complex_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public ParquetAvroCompat() {} + + /** + * All-args constructor. + */ + public ParquetAvroCompat(java.util.List strings_column, java.util.Map string_to_int_column, java.util.Map> complex_column) { + this.strings_column = strings_column; + this.string_to_int_column = string_to_int_column; + this.complex_column = complex_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return strings_column; + case 1: return string_to_int_column; + case 2: return complex_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. 
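// Editor's note -- illustrative sketch only, not part of the Avro-generated file or of the
// original diff. The Javadoc on the default constructor above steers callers toward
// newBuilder(), because the no-arg constructor does not apply schema defaults. A minimal,
// hypothetical use of the builder API generated in this patch (ParquetAvroCompat.newBuilder,
// setStringsColumn, setStringToIntColumn, setComplexColumn, Nested.newBuilder, build);
// all values below are made up for illustration:
private static ParquetAvroCompat exampleRecord() {
  Nested nested = Nested.newBuilder()
      .setNestedIntsColumn(java.util.Arrays.asList(1, 2, 3))
      .setNestedStringColumn("nested")
      .build();
  return ParquetAvroCompat.newBuilder()
      .setStringsColumn(java.util.Arrays.asList("a", "b"))
      .setStringToIntColumn(java.util.Collections.singletonMap("a", 1))
      .setComplexColumn(java.util.Collections.singletonMap("key", java.util.Arrays.asList(nested)))
      .build();  // build() falls back to defaultValue(fields()[i]) for any field left unset
}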
+ @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: strings_column = (java.util.List)value$; break; + case 1: string_to_int_column = (java.util.Map)value$; break; + case 2: complex_column = (java.util.Map>)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'strings_column' field. + */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** + * Sets the value of the 'strings_column' field. + * @param value the value to set. + */ + public void setStringsColumn(java.util.List value) { + this.strings_column = value; + } + + /** + * Gets the value of the 'string_to_int_column' field. + */ + public java.util.Map getStringToIntColumn() { + return string_to_int_column; + } + + /** + * Sets the value of the 'string_to_int_column' field. + * @param value the value to set. + */ + public void setStringToIntColumn(java.util.Map value) { + this.string_to_int_column = value; + } + + /** + * Gets the value of the 'complex_column' field. + */ + public java.util.Map> getComplexColumn() { + return complex_column; + } + + /** + * Sets the value of the 'complex_column' field. + * @param value the value to set. + */ + public void setComplexColumn(java.util.Map> value) { + this.complex_column = value; + } + + /** Creates a new ParquetAvroCompat RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder(); + } + + /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder(other); + } + + /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing ParquetAvroCompat instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder(other); + } + + /** + * RecordBuilder for ParquetAvroCompat instances. 
+ */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.List strings_column; + private java.util.Map string_to_int_column; + private java.util.Map> complex_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder other) { + super(other); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.string_to_int_column)) { + this.string_to_int_column = data().deepCopy(fields()[1].schema(), other.string_to_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.complex_column)) { + this.complex_column = data().deepCopy(fields()[2].schema(), other.complex_column); + fieldSetFlags()[2] = true; + } + } + + /** Creates a Builder by copying an existing ParquetAvroCompat instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.SCHEMA$); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.string_to_int_column)) { + this.string_to_int_column = data().deepCopy(fields()[1].schema(), other.string_to_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.complex_column)) { + this.complex_column = data().deepCopy(fields()[2].schema(), other.complex_column); + fieldSetFlags()[2] = true; + } + } + + /** Gets the value of the 'strings_column' field */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** Sets the value of the 'strings_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setStringsColumn(java.util.List value) { + validate(fields()[0], value); + this.strings_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'strings_column' field has been set */ + public boolean hasStringsColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'strings_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearStringsColumn() { + strings_column = null; + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'string_to_int_column' field */ + public java.util.Map getStringToIntColumn() { + return string_to_int_column; + } + + /** Sets the value of the 'string_to_int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setStringToIntColumn(java.util.Map value) { + validate(fields()[1], value); + this.string_to_int_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'string_to_int_column' field has been set */ + public boolean hasStringToIntColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'string_to_int_column' field */ + public 
org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearStringToIntColumn() { + string_to_int_column = null; + fieldSetFlags()[1] = false; + return this; + } + + /** Gets the value of the 'complex_column' field */ + public java.util.Map> getComplexColumn() { + return complex_column; + } + + /** Sets the value of the 'complex_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setComplexColumn(java.util.Map> value) { + validate(fields()[2], value); + this.complex_column = value; + fieldSetFlags()[2] = true; + return this; + } + + /** Checks whether the 'complex_column' field has been set */ + public boolean hasComplexColumn() { + return fieldSetFlags()[2]; + } + + /** Clears the value of the 'complex_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearComplexColumn() { + complex_column = null; + fieldSetFlags()[2] = false; + return this; + } + + @Override + public ParquetAvroCompat build() { + try { + ParquetAvroCompat record = new ParquetAvroCompat(); + record.strings_column = fieldSetFlags()[0] ? this.strings_column : (java.util.List) defaultValue(fields()[0]); + record.string_to_int_column = fieldSetFlags()[1] ? this.string_to_int_column : (java.util.Map) defaultValue(fields()[1]); + record.complex_column = fieldSetFlags()[2] ? this.complex_column : (java.util.Map>) defaultValue(fields()[2]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetEnum.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetEnum.java new file mode 100644 index 000000000000..05fefe4cee75 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetEnum.java @@ -0,0 +1,142 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class ParquetEnum extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetEnum\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"suit\",\"type\":{\"type\":\"enum\",\"name\":\"Suit\",\"symbols\":[\"SPADES\",\"HEARTS\",\"DIAMONDS\",\"CLUBS\"]}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit suit; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public ParquetEnum() {} + + /** + * All-args constructor. + */ + public ParquetEnum(org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit suit) { + this.suit = suit; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. 
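// Editor's note -- illustrative sketch only, not part of the generated file or the original
// diff. ParquetEnum simply wraps the Suit enum added later in this patch; a hypothetical
// round-trip through the generated builder, using only methods visible in this diff
// (newBuilder, setSuit, hasSuit, build):
private static ParquetEnum exampleEnum() {
  ParquetEnum.Builder builder = ParquetEnum.newBuilder().setSuit(Suit.SPADES);
  if (!builder.hasSuit()) {           // hasSuit() reflects fieldSetFlags()[0]
    throw new IllegalStateException("suit should be flagged as set");
  }
  return builder.build();             // an unset suit would resolve to defaultValue(fields()[0])
}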
+ public java.lang.Object get(int field$) { + switch (field$) { + case 0: return suit; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: suit = (org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'suit' field. + */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit getSuit() { + return suit; + } + + /** + * Sets the value of the 'suit' field. + * @param value the value to set. + */ + public void setSuit(org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit value) { + this.suit = value; + } + + /** Creates a new ParquetEnum RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder(); + } + + /** Creates a new ParquetEnum RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder(other); + } + + /** Creates a new ParquetEnum RecordBuilder by copying an existing ParquetEnum instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder(other); + } + + /** + * RecordBuilder for ParquetEnum instances. 
+ */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit suit; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder other) { + super(other); + if (isValidValue(fields()[0], other.suit)) { + this.suit = data().deepCopy(fields()[0].schema(), other.suit); + fieldSetFlags()[0] = true; + } + } + + /** Creates a Builder by copying an existing ParquetEnum instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.SCHEMA$); + if (isValidValue(fields()[0], other.suit)) { + this.suit = data().deepCopy(fields()[0].schema(), other.suit); + fieldSetFlags()[0] = true; + } + } + + /** Gets the value of the 'suit' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit getSuit() { + return suit; + } + + /** Sets the value of the 'suit' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder setSuit(org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit value) { + validate(fields()[0], value); + this.suit = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'suit' field has been set */ + public boolean hasSuit() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'suit' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetEnum.Builder clearSuit() { + suit = null; + fieldSetFlags()[0] = false; + return this; + } + + @Override + public ParquetEnum build() { + try { + ParquetEnum record = new ParquetEnum(); + record.suit = fieldSetFlags()[0] ? 
this.suit : (org.apache.spark.sql.execution.datasources.parquet.test.avro.Suit) defaultValue(fields()[0]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Suit.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Suit.java new file mode 100644 index 000000000000..00711a0c2a26 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/Suit.java @@ -0,0 +1,13 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public enum Suit { + SPADES, HEARTS, DIAMONDS, CLUBS ; + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"enum\",\"name\":\"Suit\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"symbols\":[\"SPADES\",\"HEARTS\",\"DIAMONDS\",\"CLUBS\"]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java deleted file mode 100644 index daec65a5bbe5..000000000000 --- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java +++ /dev/null @@ -1,17 +0,0 @@ -/** - * Autogenerated by Avro - * - * DO NOT EDIT DIRECTLY - */ -package org.apache.spark.sql.parquet.test.avro; - -@SuppressWarnings("all") -@org.apache.avro.specific.AvroGenerated -public interface CompatibilityTest { - public static final org.apache.avro.Protocol PROTOCOL = 
org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"types\":[{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}"); - - @SuppressWarnings("all") - public interface Callback extends CompatibilityTest { - public static final org.apache.avro.Protocol PROTOCOL = org.apache.spark.sql.parquet.test.avro.CompatibilityTest.PROTOCOL; - } -} \ No newline at end of file diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java deleted file mode 100644 index 354c9d73cca3..000000000000 --- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java +++ /dev/null @@ -1,1001 +0,0 @@ -/** - * Autogenerated by Avro - * - * DO NOT EDIT DIRECTLY - */ -package org.apache.spark.sql.parquet.test.avro; -@SuppressWarnings("all") -@org.apache.avro.specific.AvroGenerated -public class ParquetAvroCompat extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { - public static final org.apache.avro.Schema SCHEMA$ = new 
org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}"); - public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } - @Deprecated public boolean bool_column; - @Deprecated public int int_column; - @Deprecated public long long_column; - @Deprecated public float float_column; - @Deprecated public double double_column; - @Deprecated public java.nio.ByteBuffer binary_column; - @Deprecated public java.lang.String string_column; - @Deprecated public java.lang.Boolean maybe_bool_column; - @Deprecated public java.lang.Integer maybe_int_column; - @Deprecated public java.lang.Long maybe_long_column; - @Deprecated public java.lang.Float maybe_float_column; - @Deprecated public java.lang.Double maybe_double_column; - @Deprecated public java.nio.ByteBuffer maybe_binary_column; - @Deprecated public java.lang.String maybe_string_column; - @Deprecated public java.util.List strings_column; - @Deprecated public java.util.Map string_to_int_column; - @Deprecated public java.util.Map> complex_column; - - /** - * Default constructor. Note that this does not initialize fields - * to their default values from the schema. If that is desired then - * one should use newBuilder(). - */ - public ParquetAvroCompat() {} - - /** - * All-args constructor. 
- */ - public ParquetAvroCompat(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column, java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column, java.util.List strings_column, java.util.Map string_to_int_column, java.util.Map> complex_column) { - this.bool_column = bool_column; - this.int_column = int_column; - this.long_column = long_column; - this.float_column = float_column; - this.double_column = double_column; - this.binary_column = binary_column; - this.string_column = string_column; - this.maybe_bool_column = maybe_bool_column; - this.maybe_int_column = maybe_int_column; - this.maybe_long_column = maybe_long_column; - this.maybe_float_column = maybe_float_column; - this.maybe_double_column = maybe_double_column; - this.maybe_binary_column = maybe_binary_column; - this.maybe_string_column = maybe_string_column; - this.strings_column = strings_column; - this.string_to_int_column = string_to_int_column; - this.complex_column = complex_column; - } - - public org.apache.avro.Schema getSchema() { return SCHEMA$; } - // Used by DatumWriter. Applications should not call. - public java.lang.Object get(int field$) { - switch (field$) { - case 0: return bool_column; - case 1: return int_column; - case 2: return long_column; - case 3: return float_column; - case 4: return double_column; - case 5: return binary_column; - case 6: return string_column; - case 7: return maybe_bool_column; - case 8: return maybe_int_column; - case 9: return maybe_long_column; - case 10: return maybe_float_column; - case 11: return maybe_double_column; - case 12: return maybe_binary_column; - case 13: return maybe_string_column; - case 14: return strings_column; - case 15: return string_to_int_column; - case 16: return complex_column; - default: throw new org.apache.avro.AvroRuntimeException("Bad index"); - } - } - // Used by DatumReader. Applications should not call. - @SuppressWarnings(value="unchecked") - public void put(int field$, java.lang.Object value$) { - switch (field$) { - case 0: bool_column = (java.lang.Boolean)value$; break; - case 1: int_column = (java.lang.Integer)value$; break; - case 2: long_column = (java.lang.Long)value$; break; - case 3: float_column = (java.lang.Float)value$; break; - case 4: double_column = (java.lang.Double)value$; break; - case 5: binary_column = (java.nio.ByteBuffer)value$; break; - case 6: string_column = (java.lang.String)value$; break; - case 7: maybe_bool_column = (java.lang.Boolean)value$; break; - case 8: maybe_int_column = (java.lang.Integer)value$; break; - case 9: maybe_long_column = (java.lang.Long)value$; break; - case 10: maybe_float_column = (java.lang.Float)value$; break; - case 11: maybe_double_column = (java.lang.Double)value$; break; - case 12: maybe_binary_column = (java.nio.ByteBuffer)value$; break; - case 13: maybe_string_column = (java.lang.String)value$; break; - case 14: strings_column = (java.util.List)value$; break; - case 15: string_to_int_column = (java.util.Map)value$; break; - case 16: complex_column = (java.util.Map>)value$; break; - default: throw new org.apache.avro.AvroRuntimeException("Bad index"); - } - } - - /** - * Gets the value of the 'bool_column' field. 
- */ - public java.lang.Boolean getBoolColumn() { - return bool_column; - } - - /** - * Sets the value of the 'bool_column' field. - * @param value the value to set. - */ - public void setBoolColumn(java.lang.Boolean value) { - this.bool_column = value; - } - - /** - * Gets the value of the 'int_column' field. - */ - public java.lang.Integer getIntColumn() { - return int_column; - } - - /** - * Sets the value of the 'int_column' field. - * @param value the value to set. - */ - public void setIntColumn(java.lang.Integer value) { - this.int_column = value; - } - - /** - * Gets the value of the 'long_column' field. - */ - public java.lang.Long getLongColumn() { - return long_column; - } - - /** - * Sets the value of the 'long_column' field. - * @param value the value to set. - */ - public void setLongColumn(java.lang.Long value) { - this.long_column = value; - } - - /** - * Gets the value of the 'float_column' field. - */ - public java.lang.Float getFloatColumn() { - return float_column; - } - - /** - * Sets the value of the 'float_column' field. - * @param value the value to set. - */ - public void setFloatColumn(java.lang.Float value) { - this.float_column = value; - } - - /** - * Gets the value of the 'double_column' field. - */ - public java.lang.Double getDoubleColumn() { - return double_column; - } - - /** - * Sets the value of the 'double_column' field. - * @param value the value to set. - */ - public void setDoubleColumn(java.lang.Double value) { - this.double_column = value; - } - - /** - * Gets the value of the 'binary_column' field. - */ - public java.nio.ByteBuffer getBinaryColumn() { - return binary_column; - } - - /** - * Sets the value of the 'binary_column' field. - * @param value the value to set. - */ - public void setBinaryColumn(java.nio.ByteBuffer value) { - this.binary_column = value; - } - - /** - * Gets the value of the 'string_column' field. - */ - public java.lang.String getStringColumn() { - return string_column; - } - - /** - * Sets the value of the 'string_column' field. - * @param value the value to set. - */ - public void setStringColumn(java.lang.String value) { - this.string_column = value; - } - - /** - * Gets the value of the 'maybe_bool_column' field. - */ - public java.lang.Boolean getMaybeBoolColumn() { - return maybe_bool_column; - } - - /** - * Sets the value of the 'maybe_bool_column' field. - * @param value the value to set. - */ - public void setMaybeBoolColumn(java.lang.Boolean value) { - this.maybe_bool_column = value; - } - - /** - * Gets the value of the 'maybe_int_column' field. - */ - public java.lang.Integer getMaybeIntColumn() { - return maybe_int_column; - } - - /** - * Sets the value of the 'maybe_int_column' field. - * @param value the value to set. - */ - public void setMaybeIntColumn(java.lang.Integer value) { - this.maybe_int_column = value; - } - - /** - * Gets the value of the 'maybe_long_column' field. - */ - public java.lang.Long getMaybeLongColumn() { - return maybe_long_column; - } - - /** - * Sets the value of the 'maybe_long_column' field. - * @param value the value to set. - */ - public void setMaybeLongColumn(java.lang.Long value) { - this.maybe_long_column = value; - } - - /** - * Gets the value of the 'maybe_float_column' field. - */ - public java.lang.Float getMaybeFloatColumn() { - return maybe_float_column; - } - - /** - * Sets the value of the 'maybe_float_column' field. - * @param value the value to set. 
- */ - public void setMaybeFloatColumn(java.lang.Float value) { - this.maybe_float_column = value; - } - - /** - * Gets the value of the 'maybe_double_column' field. - */ - public java.lang.Double getMaybeDoubleColumn() { - return maybe_double_column; - } - - /** - * Sets the value of the 'maybe_double_column' field. - * @param value the value to set. - */ - public void setMaybeDoubleColumn(java.lang.Double value) { - this.maybe_double_column = value; - } - - /** - * Gets the value of the 'maybe_binary_column' field. - */ - public java.nio.ByteBuffer getMaybeBinaryColumn() { - return maybe_binary_column; - } - - /** - * Sets the value of the 'maybe_binary_column' field. - * @param value the value to set. - */ - public void setMaybeBinaryColumn(java.nio.ByteBuffer value) { - this.maybe_binary_column = value; - } - - /** - * Gets the value of the 'maybe_string_column' field. - */ - public java.lang.String getMaybeStringColumn() { - return maybe_string_column; - } - - /** - * Sets the value of the 'maybe_string_column' field. - * @param value the value to set. - */ - public void setMaybeStringColumn(java.lang.String value) { - this.maybe_string_column = value; - } - - /** - * Gets the value of the 'strings_column' field. - */ - public java.util.List getStringsColumn() { - return strings_column; - } - - /** - * Sets the value of the 'strings_column' field. - * @param value the value to set. - */ - public void setStringsColumn(java.util.List value) { - this.strings_column = value; - } - - /** - * Gets the value of the 'string_to_int_column' field. - */ - public java.util.Map getStringToIntColumn() { - return string_to_int_column; - } - - /** - * Sets the value of the 'string_to_int_column' field. - * @param value the value to set. - */ - public void setStringToIntColumn(java.util.Map value) { - this.string_to_int_column = value; - } - - /** - * Gets the value of the 'complex_column' field. - */ - public java.util.Map> getComplexColumn() { - return complex_column; - } - - /** - * Sets the value of the 'complex_column' field. - * @param value the value to set. - */ - public void setComplexColumn(java.util.Map> value) { - this.complex_column = value; - } - - /** Creates a new ParquetAvroCompat RecordBuilder */ - public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder() { - return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(); - } - - /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing Builder */ - public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) { - return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other); - } - - /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing ParquetAvroCompat instance */ - public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) { - return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other); - } - - /** - * RecordBuilder for ParquetAvroCompat instances. 
- */ - public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase - implements org.apache.avro.data.RecordBuilder { - - private boolean bool_column; - private int int_column; - private long long_column; - private float float_column; - private double double_column; - private java.nio.ByteBuffer binary_column; - private java.lang.String string_column; - private java.lang.Boolean maybe_bool_column; - private java.lang.Integer maybe_int_column; - private java.lang.Long maybe_long_column; - private java.lang.Float maybe_float_column; - private java.lang.Double maybe_double_column; - private java.nio.ByteBuffer maybe_binary_column; - private java.lang.String maybe_string_column; - private java.util.List strings_column; - private java.util.Map string_to_int_column; - private java.util.Map> complex_column; - - /** Creates a new Builder */ - private Builder() { - super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$); - } - - /** Creates a Builder by copying an existing Builder */ - private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) { - super(other); - if (isValidValue(fields()[0], other.bool_column)) { - this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); - fieldSetFlags()[0] = true; - } - if (isValidValue(fields()[1], other.int_column)) { - this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); - fieldSetFlags()[1] = true; - } - if (isValidValue(fields()[2], other.long_column)) { - this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); - fieldSetFlags()[2] = true; - } - if (isValidValue(fields()[3], other.float_column)) { - this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); - fieldSetFlags()[3] = true; - } - if (isValidValue(fields()[4], other.double_column)) { - this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); - fieldSetFlags()[4] = true; - } - if (isValidValue(fields()[5], other.binary_column)) { - this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); - fieldSetFlags()[5] = true; - } - if (isValidValue(fields()[6], other.string_column)) { - this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); - fieldSetFlags()[6] = true; - } - if (isValidValue(fields()[7], other.maybe_bool_column)) { - this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column); - fieldSetFlags()[7] = true; - } - if (isValidValue(fields()[8], other.maybe_int_column)) { - this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column); - fieldSetFlags()[8] = true; - } - if (isValidValue(fields()[9], other.maybe_long_column)) { - this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column); - fieldSetFlags()[9] = true; - } - if (isValidValue(fields()[10], other.maybe_float_column)) { - this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column); - fieldSetFlags()[10] = true; - } - if (isValidValue(fields()[11], other.maybe_double_column)) { - this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column); - fieldSetFlags()[11] = true; - } - if (isValidValue(fields()[12], other.maybe_binary_column)) { - this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column); - fieldSetFlags()[12] = true; - } - if (isValidValue(fields()[13], other.maybe_string_column)) { - this.maybe_string_column = 
data().deepCopy(fields()[13].schema(), other.maybe_string_column); - fieldSetFlags()[13] = true; - } - if (isValidValue(fields()[14], other.strings_column)) { - this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column); - fieldSetFlags()[14] = true; - } - if (isValidValue(fields()[15], other.string_to_int_column)) { - this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column); - fieldSetFlags()[15] = true; - } - if (isValidValue(fields()[16], other.complex_column)) { - this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column); - fieldSetFlags()[16] = true; - } - } - - /** Creates a Builder by copying an existing ParquetAvroCompat instance */ - private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) { - super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$); - if (isValidValue(fields()[0], other.bool_column)) { - this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); - fieldSetFlags()[0] = true; - } - if (isValidValue(fields()[1], other.int_column)) { - this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); - fieldSetFlags()[1] = true; - } - if (isValidValue(fields()[2], other.long_column)) { - this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); - fieldSetFlags()[2] = true; - } - if (isValidValue(fields()[3], other.float_column)) { - this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); - fieldSetFlags()[3] = true; - } - if (isValidValue(fields()[4], other.double_column)) { - this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); - fieldSetFlags()[4] = true; - } - if (isValidValue(fields()[5], other.binary_column)) { - this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); - fieldSetFlags()[5] = true; - } - if (isValidValue(fields()[6], other.string_column)) { - this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); - fieldSetFlags()[6] = true; - } - if (isValidValue(fields()[7], other.maybe_bool_column)) { - this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column); - fieldSetFlags()[7] = true; - } - if (isValidValue(fields()[8], other.maybe_int_column)) { - this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column); - fieldSetFlags()[8] = true; - } - if (isValidValue(fields()[9], other.maybe_long_column)) { - this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column); - fieldSetFlags()[9] = true; - } - if (isValidValue(fields()[10], other.maybe_float_column)) { - this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column); - fieldSetFlags()[10] = true; - } - if (isValidValue(fields()[11], other.maybe_double_column)) { - this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column); - fieldSetFlags()[11] = true; - } - if (isValidValue(fields()[12], other.maybe_binary_column)) { - this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column); - fieldSetFlags()[12] = true; - } - if (isValidValue(fields()[13], other.maybe_string_column)) { - this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column); - fieldSetFlags()[13] = true; - } - if (isValidValue(fields()[14], other.strings_column)) { - this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column); - 
fieldSetFlags()[14] = true; - } - if (isValidValue(fields()[15], other.string_to_int_column)) { - this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column); - fieldSetFlags()[15] = true; - } - if (isValidValue(fields()[16], other.complex_column)) { - this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column); - fieldSetFlags()[16] = true; - } - } - - /** Gets the value of the 'bool_column' field */ - public java.lang.Boolean getBoolColumn() { - return bool_column; - } - - /** Sets the value of the 'bool_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBoolColumn(boolean value) { - validate(fields()[0], value); - this.bool_column = value; - fieldSetFlags()[0] = true; - return this; - } - - /** Checks whether the 'bool_column' field has been set */ - public boolean hasBoolColumn() { - return fieldSetFlags()[0]; - } - - /** Clears the value of the 'bool_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBoolColumn() { - fieldSetFlags()[0] = false; - return this; - } - - /** Gets the value of the 'int_column' field */ - public java.lang.Integer getIntColumn() { - return int_column; - } - - /** Sets the value of the 'int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setIntColumn(int value) { - validate(fields()[1], value); - this.int_column = value; - fieldSetFlags()[1] = true; - return this; - } - - /** Checks whether the 'int_column' field has been set */ - public boolean hasIntColumn() { - return fieldSetFlags()[1]; - } - - /** Clears the value of the 'int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearIntColumn() { - fieldSetFlags()[1] = false; - return this; - } - - /** Gets the value of the 'long_column' field */ - public java.lang.Long getLongColumn() { - return long_column; - } - - /** Sets the value of the 'long_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setLongColumn(long value) { - validate(fields()[2], value); - this.long_column = value; - fieldSetFlags()[2] = true; - return this; - } - - /** Checks whether the 'long_column' field has been set */ - public boolean hasLongColumn() { - return fieldSetFlags()[2]; - } - - /** Clears the value of the 'long_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearLongColumn() { - fieldSetFlags()[2] = false; - return this; - } - - /** Gets the value of the 'float_column' field */ - public java.lang.Float getFloatColumn() { - return float_column; - } - - /** Sets the value of the 'float_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setFloatColumn(float value) { - validate(fields()[3], value); - this.float_column = value; - fieldSetFlags()[3] = true; - return this; - } - - /** Checks whether the 'float_column' field has been set */ - public boolean hasFloatColumn() { - return fieldSetFlags()[3]; - } - - /** Clears the value of the 'float_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearFloatColumn() { - fieldSetFlags()[3] = false; - return this; - } - - /** Gets the value of the 'double_column' field */ - public java.lang.Double getDoubleColumn() { - return double_column; - } - - /** Sets the value of the 'double_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setDoubleColumn(double 
value) { - validate(fields()[4], value); - this.double_column = value; - fieldSetFlags()[4] = true; - return this; - } - - /** Checks whether the 'double_column' field has been set */ - public boolean hasDoubleColumn() { - return fieldSetFlags()[4]; - } - - /** Clears the value of the 'double_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearDoubleColumn() { - fieldSetFlags()[4] = false; - return this; - } - - /** Gets the value of the 'binary_column' field */ - public java.nio.ByteBuffer getBinaryColumn() { - return binary_column; - } - - /** Sets the value of the 'binary_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBinaryColumn(java.nio.ByteBuffer value) { - validate(fields()[5], value); - this.binary_column = value; - fieldSetFlags()[5] = true; - return this; - } - - /** Checks whether the 'binary_column' field has been set */ - public boolean hasBinaryColumn() { - return fieldSetFlags()[5]; - } - - /** Clears the value of the 'binary_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBinaryColumn() { - binary_column = null; - fieldSetFlags()[5] = false; - return this; - } - - /** Gets the value of the 'string_column' field */ - public java.lang.String getStringColumn() { - return string_column; - } - - /** Sets the value of the 'string_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringColumn(java.lang.String value) { - validate(fields()[6], value); - this.string_column = value; - fieldSetFlags()[6] = true; - return this; - } - - /** Checks whether the 'string_column' field has been set */ - public boolean hasStringColumn() { - return fieldSetFlags()[6]; - } - - /** Clears the value of the 'string_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringColumn() { - string_column = null; - fieldSetFlags()[6] = false; - return this; - } - - /** Gets the value of the 'maybe_bool_column' field */ - public java.lang.Boolean getMaybeBoolColumn() { - return maybe_bool_column; - } - - /** Sets the value of the 'maybe_bool_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBoolColumn(java.lang.Boolean value) { - validate(fields()[7], value); - this.maybe_bool_column = value; - fieldSetFlags()[7] = true; - return this; - } - - /** Checks whether the 'maybe_bool_column' field has been set */ - public boolean hasMaybeBoolColumn() { - return fieldSetFlags()[7]; - } - - /** Clears the value of the 'maybe_bool_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBoolColumn() { - maybe_bool_column = null; - fieldSetFlags()[7] = false; - return this; - } - - /** Gets the value of the 'maybe_int_column' field */ - public java.lang.Integer getMaybeIntColumn() { - return maybe_int_column; - } - - /** Sets the value of the 'maybe_int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeIntColumn(java.lang.Integer value) { - validate(fields()[8], value); - this.maybe_int_column = value; - fieldSetFlags()[8] = true; - return this; - } - - /** Checks whether the 'maybe_int_column' field has been set */ - public boolean hasMaybeIntColumn() { - return fieldSetFlags()[8]; - } - - /** Clears the value of the 'maybe_int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeIntColumn() { - 
maybe_int_column = null; - fieldSetFlags()[8] = false; - return this; - } - - /** Gets the value of the 'maybe_long_column' field */ - public java.lang.Long getMaybeLongColumn() { - return maybe_long_column; - } - - /** Sets the value of the 'maybe_long_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeLongColumn(java.lang.Long value) { - validate(fields()[9], value); - this.maybe_long_column = value; - fieldSetFlags()[9] = true; - return this; - } - - /** Checks whether the 'maybe_long_column' field has been set */ - public boolean hasMaybeLongColumn() { - return fieldSetFlags()[9]; - } - - /** Clears the value of the 'maybe_long_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeLongColumn() { - maybe_long_column = null; - fieldSetFlags()[9] = false; - return this; - } - - /** Gets the value of the 'maybe_float_column' field */ - public java.lang.Float getMaybeFloatColumn() { - return maybe_float_column; - } - - /** Sets the value of the 'maybe_float_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeFloatColumn(java.lang.Float value) { - validate(fields()[10], value); - this.maybe_float_column = value; - fieldSetFlags()[10] = true; - return this; - } - - /** Checks whether the 'maybe_float_column' field has been set */ - public boolean hasMaybeFloatColumn() { - return fieldSetFlags()[10]; - } - - /** Clears the value of the 'maybe_float_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeFloatColumn() { - maybe_float_column = null; - fieldSetFlags()[10] = false; - return this; - } - - /** Gets the value of the 'maybe_double_column' field */ - public java.lang.Double getMaybeDoubleColumn() { - return maybe_double_column; - } - - /** Sets the value of the 'maybe_double_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeDoubleColumn(java.lang.Double value) { - validate(fields()[11], value); - this.maybe_double_column = value; - fieldSetFlags()[11] = true; - return this; - } - - /** Checks whether the 'maybe_double_column' field has been set */ - public boolean hasMaybeDoubleColumn() { - return fieldSetFlags()[11]; - } - - /** Clears the value of the 'maybe_double_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeDoubleColumn() { - maybe_double_column = null; - fieldSetFlags()[11] = false; - return this; - } - - /** Gets the value of the 'maybe_binary_column' field */ - public java.nio.ByteBuffer getMaybeBinaryColumn() { - return maybe_binary_column; - } - - /** Sets the value of the 'maybe_binary_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) { - validate(fields()[12], value); - this.maybe_binary_column = value; - fieldSetFlags()[12] = true; - return this; - } - - /** Checks whether the 'maybe_binary_column' field has been set */ - public boolean hasMaybeBinaryColumn() { - return fieldSetFlags()[12]; - } - - /** Clears the value of the 'maybe_binary_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBinaryColumn() { - maybe_binary_column = null; - fieldSetFlags()[12] = false; - return this; - } - - /** Gets the value of the 'maybe_string_column' field */ - public java.lang.String getMaybeStringColumn() { - return maybe_string_column; - } - - /** Sets the value 
of the 'maybe_string_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeStringColumn(java.lang.String value) { - validate(fields()[13], value); - this.maybe_string_column = value; - fieldSetFlags()[13] = true; - return this; - } - - /** Checks whether the 'maybe_string_column' field has been set */ - public boolean hasMaybeStringColumn() { - return fieldSetFlags()[13]; - } - - /** Clears the value of the 'maybe_string_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeStringColumn() { - maybe_string_column = null; - fieldSetFlags()[13] = false; - return this; - } - - /** Gets the value of the 'strings_column' field */ - public java.util.List getStringsColumn() { - return strings_column; - } - - /** Sets the value of the 'strings_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringsColumn(java.util.List value) { - validate(fields()[14], value); - this.strings_column = value; - fieldSetFlags()[14] = true; - return this; - } - - /** Checks whether the 'strings_column' field has been set */ - public boolean hasStringsColumn() { - return fieldSetFlags()[14]; - } - - /** Clears the value of the 'strings_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringsColumn() { - strings_column = null; - fieldSetFlags()[14] = false; - return this; - } - - /** Gets the value of the 'string_to_int_column' field */ - public java.util.Map getStringToIntColumn() { - return string_to_int_column; - } - - /** Sets the value of the 'string_to_int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringToIntColumn(java.util.Map value) { - validate(fields()[15], value); - this.string_to_int_column = value; - fieldSetFlags()[15] = true; - return this; - } - - /** Checks whether the 'string_to_int_column' field has been set */ - public boolean hasStringToIntColumn() { - return fieldSetFlags()[15]; - } - - /** Clears the value of the 'string_to_int_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringToIntColumn() { - string_to_int_column = null; - fieldSetFlags()[15] = false; - return this; - } - - /** Gets the value of the 'complex_column' field */ - public java.util.Map> getComplexColumn() { - return complex_column; - } - - /** Sets the value of the 'complex_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setComplexColumn(java.util.Map> value) { - validate(fields()[16], value); - this.complex_column = value; - fieldSetFlags()[16] = true; - return this; - } - - /** Checks whether the 'complex_column' field has been set */ - public boolean hasComplexColumn() { - return fieldSetFlags()[16]; - } - - /** Clears the value of the 'complex_column' field */ - public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearComplexColumn() { - complex_column = null; - fieldSetFlags()[16] = false; - return this; - } - - @Override - public ParquetAvroCompat build() { - try { - ParquetAvroCompat record = new ParquetAvroCompat(); - record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]); - record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]); - record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]); - record.float_column = fieldSetFlags()[3] ? 
this.float_column : (java.lang.Float) defaultValue(fields()[3]); - record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]); - record.binary_column = fieldSetFlags()[5] ? this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); - record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]); - record.maybe_bool_column = fieldSetFlags()[7] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[7]); - record.maybe_int_column = fieldSetFlags()[8] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[8]); - record.maybe_long_column = fieldSetFlags()[9] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[9]); - record.maybe_float_column = fieldSetFlags()[10] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[10]); - record.maybe_double_column = fieldSetFlags()[11] ? this.maybe_double_column : (java.lang.Double) defaultValue(fields()[11]); - record.maybe_binary_column = fieldSetFlags()[12] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[12]); - record.maybe_string_column = fieldSetFlags()[13] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[13]); - record.strings_column = fieldSetFlags()[14] ? this.strings_column : (java.util.List) defaultValue(fields()[14]); - record.string_to_int_column = fieldSetFlags()[15] ? this.string_to_int_column : (java.util.Map) defaultValue(fields()[15]); - record.complex_column = fieldSetFlags()[16] ? this.complex_column : (java.util.Map>) defaultValue(fields()[16]); - return record; - } catch (Exception e) { - throw new org.apache.avro.AvroRuntimeException(e); - } - } - } -} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java index cb84e78d628c..bf693c7c393f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java @@ -27,6 +27,7 @@ import org.junit.Before; import org.junit.Test; +import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; @@ -34,7 +35,6 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.test.TestSQLContext$; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -48,14 +48,16 @@ public class JavaApplySchemaSuite implements Serializable { @Before public void setUp() { - sqlContext = TestSQLContext$.MODULE$; - javaCtx = new JavaSparkContext(sqlContext.sparkContext()); + SparkContext context = new SparkContext("local[*]", "testing"); + javaCtx = new JavaSparkContext(context); + sqlContext = new SQLContext(context); } @After public void tearDown() { - javaCtx = null; + sqlContext.sparkContext().stop(); sqlContext = null; + javaCtx = null; } public static class Person implements Serializable { @@ -164,7 +166,7 @@ public void applySchemaToJSON() { "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, " + "\"boolean\":false, \"null\":null}")); List fields = new ArrayList(7); - fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(38, 18), + 
fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(20, 0), true)); fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true)); fields.add(DataTypes.createStructField("double", DataTypes.DoubleType, true)); diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 2c669bb59a0b..7abdd3db8034 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -17,44 +17,45 @@ package test.org.apache.spark.sql; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +import scala.collection.JavaConversions; +import scala.collection.Seq; + import com.google.common.collect.ImmutableMap; import com.google.common.primitives.Ints; +import org.junit.*; +import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; +import static org.apache.spark.sql.functions.*; import org.apache.spark.sql.test.TestSQLContext; -import org.apache.spark.sql.test.TestSQLContext$; import org.apache.spark.sql.types.*; -import org.junit.*; - -import scala.collection.JavaConversions; -import scala.collection.Seq; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; - -import static org.apache.spark.sql.functions.*; public class JavaDataFrameSuite { private transient JavaSparkContext jsc; - private transient SQLContext context; + private transient TestSQLContext context; @Before public void setUp() { // Trigger static initializer of TestData - TestData$.MODULE$.testData(); - jsc = new JavaSparkContext(TestSQLContext.sparkContext()); - context = TestSQLContext$.MODULE$; + SparkContext sc = new SparkContext("local[*]", "testing"); + jsc = new JavaSparkContext(sc); + context = new TestSQLContext(sc); + context.loadTestData(); } @After public void tearDown() { - jsc = null; + context.sparkContext().stop(); context = null; + jsc = null; } @Test @@ -167,6 +168,7 @@ public void testCreateDataFrameFromJavaBeans() { for (int i = 0; i < result.length(); i++) { Assert.assertEquals(bean.getB()[i], result.apply(i)); } + @SuppressWarnings("unchecked") Seq outputBuffer = (Seq) first.getJavaMap(2).get("hello"); Assert.assertArrayEquals( bean.getC().get("hello"), @@ -229,7 +231,7 @@ public void testCovariance() { @Test public void testSampleBy() { - DataFrame df = context.range(0, 100).select(col("id").mod(3).as("key")); + DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); DataFrame sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); Row[] actual = sampled.groupBy("key").count().orderBy("key").collect(); Row[] expected = new Row[] {RowFactory.create(0, 5), RowFactory.create(1, 8)}; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index 79d92734ff37..bb02b58cca9b 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -23,12 +23,12 @@ import org.junit.Before; import org.junit.Test; +import org.apache.spark.SparkContext; import org.apache.spark.sql.Row; import 
org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.api.java.UDF1;
 import org.apache.spark.sql.api.java.UDF2;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.test.TestSQLContext$;
 import org.apache.spark.sql.types.DataTypes;
 
 // The test suite itself is Serializable so that anonymous Function implementations can be
@@ -40,12 +40,16 @@ public class JavaUDFSuite implements Serializable {
 
   @Before
   public void setUp() {
-    sqlContext = TestSQLContext$.MODULE$;
-    sc = new JavaSparkContext(sqlContext.sparkContext());
+    SparkContext _sc = new SparkContext("local[*]", "testing");
+    sqlContext = new SQLContext(_sc);
+    sc = new JavaSparkContext(_sc);
   }
 
   @After
   public void tearDown() {
+    sqlContext.sparkContext().stop();
+    sqlContext = null;
+    sc = null;
   }
 
   @SuppressWarnings("unchecked")
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
index 2706e01bd28a..6f9e7f68dc39 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java
@@ -21,13 +21,14 @@
 import java.io.IOException;
 import java.util.*;
 
+import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
+import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.test.TestSQLContext$;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
@@ -52,8 +53,9 @@ private void checkAnswer(DataFrame actual, List expected) {
   @Before
   public void setUp() throws IOException {
-    sqlContext = TestSQLContext$.MODULE$;
-    sc = new JavaSparkContext(sqlContext.sparkContext());
+    SparkContext _sc = new SparkContext("local[*]", "testing");
+    sqlContext = new SQLContext(_sc);
+    sc = new JavaSparkContext(_sc);
 
     originalDefaultSource = sqlContext.conf().defaultDataSourceName();
 
     path =
@@ -71,6 +73,13 @@ public void setUp() throws IOException {
     df.registerTempTable("jsonTable");
   }
 
+  @After
+  public void tearDown() {
+    sqlContext.sparkContext().stop();
+    sqlContext = null;
+    sc = null;
+  }
+
   @Test
   public void saveAndLoad() {
     Map options = new HashMap();
diff --git a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
new file mode 100644
index 000000000000..cfd7889b4ac2
--- /dev/null
+++ b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -0,0 +1,3 @@
+org.apache.spark.sql.sources.FakeSourceOne
+org.apache.spark.sql.sources.FakeSourceTwo
+org.apache.spark.sql.sources.FakeSourceThree
diff --git a/sql/core/src/test/resources/nested-array-struct.parquet b/sql/core/src/test/resources/nested-array-struct.parquet
new file mode 100644
index 000000000000..41a43fa35d39
Binary files /dev/null and b/sql/core/src/test/resources/nested-array-struct.parquet differ
diff --git a/sql/core/src/test/resources/old-repeated-int.parquet b/sql/core/src/test/resources/old-repeated-int.parquet
new file mode 100644
index 000000000000..520922f73ebb
Binary files /dev/null and b/sql/core/src/test/resources/old-repeated-int.parquet differ
diff --git a/sql/core/src/test/resources/old-repeated-message.parquet b/sql/core/src/test/resources/old-repeated-message.parquet
new file mode 100644
index 000000000000..548db9916277
Binary files /dev/null and b/sql/core/src/test/resources/old-repeated-message.parquet differ
diff --git a/sql/core/src/test/resources/old-repeated.parquet b/sql/core/src/test/resources/old-repeated.parquet
new file mode 100644
index 000000000000..213f1a90291b
Binary files /dev/null and b/sql/core/src/test/resources/old-repeated.parquet differ
diff --git a/sql/core/src/test/resources/parquet-thrift-compat.snappy.parquet b/sql/core/src/test/resources/parquet-thrift-compat.snappy.parquet
old mode 100755
new mode 100644
diff --git a/sql/core/src/test/resources/proto-repeated-string.parquet b/sql/core/src/test/resources/proto-repeated-string.parquet
new file mode 100644
index 000000000000..8a7eea601d01
Binary files /dev/null and b/sql/core/src/test/resources/proto-repeated-string.parquet differ
diff --git a/sql/core/src/test/resources/proto-repeated-struct.parquet b/sql/core/src/test/resources/proto-repeated-struct.parquet
new file mode 100644
index 000000000000..c29eee35c350
Binary files /dev/null and b/sql/core/src/test/resources/proto-repeated-struct.parquet differ
diff --git a/sql/core/src/test/resources/proto-struct-with-array-many.parquet b/sql/core/src/test/resources/proto-struct-with-array-many.parquet
new file mode 100644
index 000000000000..ff9809675fc0
Binary files /dev/null and b/sql/core/src/test/resources/proto-struct-with-array-many.parquet differ
diff --git a/sql/core/src/test/resources/proto-struct-with-array.parquet b/sql/core/src/test/resources/proto-struct-with-array.parquet
new file mode 100644
index 000000000000..325a8370ad20
Binary files /dev/null and b/sql/core/src/test/resources/proto-struct-with-array.parquet differ
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index eb3e91332206..af7590c3d3c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -18,23 +18,20 @@
 package org.apache.spark.sql
 
 import scala.concurrent.duration._
-import scala.language.{implicitConversions, postfixOps}
+import scala.language.postfixOps
 
 import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark.Accumulators
-import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.columnar._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.storage.{StorageLevel, RDDBlockId}
 
-case class BigData(s: String)
+private case class BigData(s: String)
 
-class CachedTableSuite extends QueryTest {
-  TestData // Load test tables.
- - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - import ctx.sql +class CachedTableSuite extends QueryTest with SharedSQLContext { + import testImplicits._ def rddIdOf(tableName: String): Int = { val executedPlan = ctx.table(tableName).queryExecution.executedPlan @@ -50,6 +47,25 @@ class CachedTableSuite extends QueryTest { ctx.sparkContext.env.blockManager.get(RDDBlockId(rddId, 0)).nonEmpty } + test("withColumn doesn't invalidate cached dataframe") { + var evalCount = 0 + val myUDF = udf((x: String) => { evalCount += 1; "result" }) + val df = Seq(("test", 1)).toDF("s", "i").select(myUDF($"s")) + df.cache() + + df.collect() + assert(evalCount === 1) + + df.collect() + assert(evalCount === 1) + + val df2 = df.withColumn("newColumn", lit(1)) + df2.collect() + + // We should not reevaluate the cached dataframe + assert(evalCount === 1) + } + test("cache temp table") { testData.select('key).registerTempTable("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable"), 0) @@ -305,12 +321,8 @@ class CachedTableSuite extends QueryTest { sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") - Accumulators.synchronized { - val accsSize = Accumulators.originals.size - ctx.cacheTable("t1") - ctx.cacheTable("t2") - assert((accsSize + 2) == Accumulators.originals.size) - } + ctx.cacheTable("t1") + ctx.cacheTable("t2") sql("SELECT * FROM t1").count() sql("SELECT * FROM t2").count() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 35ca0b4c7cc2..37738ec5b3c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,20 +17,93 @@ package org.apache.spark.sql +import org.apache.spark.sql.catalyst.expressions.NamedExpression import org.scalatest.Matchers._ import org.apache.spark.sql.execution.{Project, TungstenProject} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql.test.SQLTestUtils -class ColumnExpressionSuite extends QueryTest with SQLTestUtils { - import org.apache.spark.sql.TestData._ +class ColumnExpressionSuite extends QueryTest with SharedSQLContext { + import testImplicits._ - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ + private lazy val booleanData = { + ctx.createDataFrame(ctx.sparkContext.parallelize( + Row(false, false) :: + Row(false, true) :: + Row(true, false) :: + Row(true, true) :: Nil), + StructType(Seq(StructField("a", BooleanType), StructField("b", BooleanType)))) + } + + test("column names with space") { + val df = Seq((1, "a")).toDF("name with space", "name.with.dot") + + checkAnswer( + df.select(df("name with space")), + Row(1) :: Nil) + + checkAnswer( + df.select($"name with space"), + Row(1) :: Nil) + + checkAnswer( + df.select(col("name with space")), + Row(1) :: Nil) + + checkAnswer( + df.select("name with space"), + Row(1) :: Nil) + + checkAnswer( + df.select(expr("`name with space`")), + Row(1) :: Nil) + } + + test("column names with dot") { + val df = Seq((1, "a")).toDF("name with space", "name.with.dot").as("a") + + checkAnswer( + df.select(df("`name.with.dot`")), + Row("a") :: Nil) + + checkAnswer( + df.select($"`name.with.dot`"), + Row("a") :: Nil) + + checkAnswer( + 
df.select(col("`name.with.dot`")), + Row("a") :: Nil) - override def sqlContext(): SQLContext = ctx + checkAnswer( + df.select("`name.with.dot`"), + Row("a") :: Nil) + + checkAnswer( + df.select(expr("`name.with.dot`")), + Row("a") :: Nil) + + checkAnswer( + df.select(df("a.`name.with.dot`")), + Row("a") :: Nil) + + checkAnswer( + df.select($"a.`name.with.dot`"), + Row("a") :: Nil) + + checkAnswer( + df.select(col("a.`name.with.dot`")), + Row("a") :: Nil) + + checkAnswer( + df.select("a.`name.with.dot`"), + Row("a") :: Nil) + + checkAnswer( + df.select(expr("a.`name.with.dot`")), + Row("a") :: Nil) + } test("alias") { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") @@ -38,6 +111,14 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { assert(df.select(df("a").alias("b")).columns.head === "b") } + test("as propagates metadata") { + val metadata = new MetadataBuilder + metadata.putString("key", "value") + val origCol = $"a".as("b", metadata.build()) + val newCol = origCol.as("c") + assert(newCol.expr.asInstanceOf[NamedExpression].metadata.getString("key") === "value") + } + test("single explode") { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") checkAnswer( @@ -190,7 +271,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { nullStrings.collect().toSeq.filter(r => r.getString(1) eq null)) checkAnswer( - ctx.sql("select isnull(null), isnull(1)"), + sql("select isnull(null), isnull(1)"), Row(true, false)) } @@ -200,7 +281,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { nullStrings.collect().toSeq.filter(r => r.getString(1) ne null)) checkAnswer( - ctx.sql("select isnotnull(null), isnotnull('a')"), + sql("select isnotnull(null), isnotnull('a')"), Row(false, true)) } @@ -221,7 +302,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { Row(true, true) :: Row(true, true) :: Row(false, false) :: Row(false, false) :: Nil) checkAnswer( - ctx.sql("select isnan(15), isnan('invalid')"), + sql("select isnan(15), isnan('invalid')"), Row(false, false)) } @@ -241,7 +322,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { ) testData.registerTempTable("t") checkAnswer( - ctx.sql( + sql( "select nanvl(a, 5), nanvl(b, 10), nanvl(10, b), nanvl(c, null), nanvl(d, 10), " + " nanvl(b, e), nanvl(e, f) from t"), Row(null, 3.0, 10.0, null, Double.PositiveInfinity, 3.0, 1.0) @@ -345,26 +426,25 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { test("in") { val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") - checkAnswer(df.filter($"a".in(1, 2)), + checkAnswer(df.filter($"a".isin(1, 2)), df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".in(3, 2)), + checkAnswer(df.filter($"a".isin(3, 2)), df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".in(3, 1)), + checkAnswer(df.filter($"a".isin(3, 1)), df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - checkAnswer(df.filter($"b".in("y", "x")), + checkAnswer(df.filter($"b".isin("y", "x")), df.collect().toSeq.filter(r => r.getString(1) == "y" || r.getString(1) == "x")) - checkAnswer(df.filter($"b".in("z", "x")), + checkAnswer(df.filter($"b".isin("z", "x")), df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "x")) - checkAnswer(df.filter($"b".in("z", "y")), + checkAnswer(df.filter($"b".isin("z", "y")), df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "y")) - } - val booleanData = 
ctx.createDataFrame(ctx.sparkContext.parallelize( - Row(false, false) :: - Row(false, true) :: - Row(true, false) :: - Row(true, true) :: Nil), - StructType(Seq(StructField("a", BooleanType), StructField("b", BooleanType)))) + val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") + + intercept[AnalysisException] { + df2.filter($"a".isin($"b")) + } + } test("&&") { checkAnswer( @@ -449,7 +529,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { ) checkAnswer( - ctx.sql("SELECT upper('aB'), ucase('cDe')"), + sql("SELECT upper('aB'), ucase('cDe')"), Row("AB", "CDE")) } @@ -470,7 +550,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { ) checkAnswer( - ctx.sql("SELECT lower('aB'), lcase('cDe')"), + sql("SELECT lower('aB'), lcase('cDe')"), Row("ab", "cde")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index f9cff7440a76..72cf7aab0b97 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -17,15 +17,13 @@ package org.apache.spark.sql -import org.apache.spark.sql.TestData._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{BinaryType, DecimalType} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.DecimalType -class DataFrameAggregateSuite extends QueryTest { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("groupBy") { checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala new file mode 100644 index 000000000000..3c359dd840ab --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext + +/** + * A test suite to test DataFrame/SQL functionalities with complex types (i.e. array, struct, map). 
+ */
+class DataFrameComplexTypeSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  test("UDF on struct") {
+    val f = udf((a: String) => a)
+    val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b")
+    df.select(struct($"a").as("s")).select(f($"s.a")).collect()
+  }
+
+  test("UDF on named_struct") {
+    val f = udf((a: String) => a)
+    val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b")
+    df.selectExpr("named_struct('a', a) s").select(f($"s.a")).collect()
+  }
+
+  test("UDF on array") {
+    val f = udf((a: String) => a)
+    val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b")
+    df.select(array($"a").as("s")).select(f(expr("s[0]"))).collect()
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 431dcf7382f1..3a3f19af1473 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -17,17 +17,15 @@
 package org.apache.spark.sql
 
-import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
 
 /**
  * Test suite for functions in [[org.apache.spark.sql.functions]].
  */
-class DataFrameFunctionsSuite extends QueryTest {
-
-  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
-  import ctx.implicits._
+class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
 
   test("array with column name") {
     val df = Seq((0, 1)).toDF("a", "b")
@@ -119,11 +117,11 @@ class DataFrameFunctionsSuite extends QueryTest {
 
   test("constant functions") {
     checkAnswer(
-      ctx.sql("SELECT E()"),
+      sql("SELECT E()"),
       Row(scala.math.E)
     )
     checkAnswer(
-      ctx.sql("SELECT PI()"),
+      sql("SELECT PI()"),
       Row(scala.math.Pi)
     )
   }
@@ -153,7 +151,7 @@ class DataFrameFunctionsSuite extends QueryTest {
 
   test("nvl function") {
     checkAnswer(
-      ctx.sql("SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null)"),
+      sql("SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null)"),
       Row("x", "y", null))
   }
 
@@ -208,13 +206,21 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(2743272264L, 2180413220L))
   }
 
+  test("string function find_in_set") {
+    val df = Seq(("abc,b,ab,c,def", "abc,b,ab,c,def")).toDF("a", "b")
+
+    checkAnswer(
+      df.selectExpr("find_in_set('ab', a)", "find_in_set('x', b)"),
+      Row(3, 0))
+  }
+
   test("conditional function: least") {
     checkAnswer(
       testData2.select(least(lit(-1), lit(0), col("a"), col("b"))).limit(1),
       Row(-1)
     )
     checkAnswer(
-      ctx.sql("SELECT least(a, 2) as l from testData2 order by l"),
+      sql("SELECT least(a, 2) as l from testData2 order by l"),
       Seq(Row(1), Row(1), Row(2), Row(2), Row(2), Row(2))
     )
   }
@@ -225,7 +231,7 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(3)
     )
     checkAnswer(
-      ctx.sql("SELECT greatest(a, 2) as g from testData2 order by g"),
+      sql("SELECT greatest(a, 2) as g from testData2 order by g"),
       Seq(Row(2), Row(2), Row(2), Row(2), Row(3), Row(3))
     )
   }
@@ -315,9 +321,9 @@ class DataFrameFunctionsSuite extends QueryTest {
 
   test("array size function") {
     val df = Seq(
-      (Array[Int](1, 2), "x"),
-      (Array[Int](), "y"),
-      (Array[Int](1, 2, 3), "z")
+      (Seq[Int](1, 2), "x"),
+      (Seq[Int](), "y"),
+      (Seq[Int](1, 2, 3), "z")
     ).toDF("a", "b")
     checkAnswer(
       df.select(size($"a")),
@@ -344,4 +350,41 @@ class DataFrameFunctionsSuite extends QueryTest {
       Seq(Row(2), Row(0), Row(3))
) } + + test("array contains function") { + val df = Seq( + (Seq[Int](1, 2), "x"), + (Seq[Int](), "x") + ).toDF("a", "b") + + // Simple test cases + checkAnswer( + df.select(array_contains(df("a"), 1)), + Seq(Row(true), Row(false)) + ) + checkAnswer( + df.selectExpr("array_contains(a, 1)"), + Seq(Row(true), Row(false)) + ) + + // In hive, this errors because null has no type information + intercept[AnalysisException] { + df.select(array_contains(df("a"), null)) + } + intercept[AnalysisException] { + df.selectExpr("array_contains(a, null)") + } + intercept[AnalysisException] { + df.selectExpr("array_contains(null, 1)") + } + + checkAnswer( + df.selectExpr("array_contains(array(array(1), null)[0], 1)"), + Seq(Row(true), Row(true)) + ) + checkAnswer( + df.selectExpr("array_contains(array(1, null), array(1, null)[0])"), + Seq(Row(true), Row(true)) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala index fbb30706a494..e5d7d63441a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala @@ -17,10 +17,10 @@ package org.apache.spark.sql -class DataFrameImplicitsSuite extends QueryTest { +import org.apache.spark.sql.test.SharedSQLContext - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class DataFrameImplicitsSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("RDD of tuples") { checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index e1c6c706242d..e2716d7841d8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -17,14 +17,12 @@ package org.apache.spark.sql -import org.apache.spark.sql.TestData._ import org.apache.spark.sql.execution.joins.BroadcastHashJoin import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext -class DataFrameJoinSuite extends QueryTest { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class DataFrameJoinSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("join - join using") { val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") @@ -59,7 +57,7 @@ class DataFrameJoinSuite extends QueryTest { checkAnswer( df1.join(df2, $"df1.key" === $"df2.key"), - ctx.sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key") + sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key") .collect().toSeq) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index dbe3b44ee2c7..cdaa14ac8078 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -19,11 +19,11 @@ package org.apache.spark.sql import scala.collection.JavaConversions._ +import org.apache.spark.sql.test.SharedSQLContext -class DataFrameNaFunctionsSuite extends QueryTest { - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class DataFrameNaFunctionsSuite extends QueryTest with 
SharedSQLContext { + import testImplicits._ def createDF(): DataFrame = { Seq[(String, java.lang.Integer, java.lang.Double)]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 07a675e64f52..28bdd6f83b68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -19,17 +19,49 @@ package org.apache.spark.sql import java.util.Random -import org.scalatest.Matchers._ - import org.apache.spark.sql.functions.col +import org.apache.spark.sql.test.SharedSQLContext -class DataFrameStatSuite extends QueryTest { - - private val sqlCtx = org.apache.spark.sql.test.TestSQLContext - import sqlCtx.implicits._ +class DataFrameStatSuite extends QueryTest with SharedSQLContext { + import testImplicits._ private def toLetter(i: Int): String = (i + 97).toChar.toString + test("sample with replacement") { + val n = 100 + val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + checkAnswer( + data.sample(withReplacement = true, 0.05, seed = 13), + Seq(5, 10, 52, 73).map(Row(_)) + ) + } + + test("sample without replacement") { + val n = 100 + val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + checkAnswer( + data.sample(withReplacement = false, 0.05, seed = 13), + Seq(16, 23, 88, 100).map(Row(_)) + ) + } + + test("randomSplit") { + val n = 600 + val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + for (seed <- 1 to 5) { + val splits = data.randomSplit(Array[Double](1, 2, 3), seed) + assert(splits.length == 3, "wrong number of splits") + + assert(splits.reduce((a, b) => a.unionAll(b)).sort("id").collect().toList == + data.collect().toList, "incomplete or wrong split") + + val s = splits.map(_.count()) + assert(math.abs(s(0) - 100) < 50) // std = 9.13 + assert(math.abs(s(1) - 200) < 50) // std = 11.55 + assert(math.abs(s(2) - 300) < 50) // std = 12.25 + } + } + test("pearson correlation") { val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c") val corr1 = df.stat.corr("a", "b", "pearson") @@ -123,16 +155,34 @@ class DataFrameStatSuite extends QueryTest { val results = df.stat.freqItems(Array("numbers", "letters"), 0.1) val items = results.collect().head - items.getSeq[Int](0) should contain (1) - items.getSeq[String](1) should contain (toLetter(1)) + assert(items.getSeq[Int](0).contains(1)) + assert(items.getSeq[String](1).contains(toLetter(1))) val singleColResults = df.stat.freqItems(Array("negDoubles"), 0.1) val items2 = singleColResults.collect().head - items2.getSeq[Double](0) should contain (-1.0) + assert(items2.getSeq[Double](0).contains(-1.0)) + } + + test("Frequent Items 2") { + val rows = ctx.sparkContext.parallelize(Seq.empty[Int], 4) + // this is a regression test, where when merging partitions, we omitted values with higher + // counts than those that existed in the map when the map was full. 
This test should also fail + // if anything like SPARK-9614 is observed once again + val df = rows.mapPartitionsWithIndex { (idx, iter) => + if (idx == 3) { // must come from one of the later merges, therefore higher partition index + Iterator("3", "3", "3", "3", "3") + } else { + Iterator("0", "1", "2", "3", "4") + } + }.toDF("a") + val results = df.stat.freqItems(Array("a"), 0.25) + val items = results.collect().head.getSeq[String](0) + assert(items.contains("3")) + assert(items.length === 1) } test("sampleBy") { - val df = sqlCtx.range(0, 100).select((col("id") % 3).as("key")) + val df = ctx.range(0, 100).select((col("id") % 3).as("key")) val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index aef940a52667..284fff184085 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -23,18 +23,12 @@ import scala.language.postfixOps import scala.util.Random import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation -import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.functions._ -import org.apache.spark.sql.json.JSONRelation -import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.sql.types._ -import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, SQLTestUtils} +import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, SharedSQLContext} -class DataFrameSuite extends QueryTest with SQLTestUtils { - import org.apache.spark.sql.TestData._ - - lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.implicits._ +class DataFrameSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("analysis error should be eagerly reported") { // Eager analysis. 
@@ -134,6 +128,21 @@ class DataFrameSuite extends QueryTest with SQLTestUtils { ) } + test("SPARK-8930: explode should fail with a meaningful message if it takes a star") { + val df = Seq(("1", "1,2"), ("2", "4"), ("3", "7,8,9")).toDF("prefix", "csv") + val e = intercept[AnalysisException] { + df.explode($"*") { case Row(prefix: String, csv: String) => + csv.split(",").map(v => Tuple1(prefix + ":" + v)).toSeq + }.queryExecution.assertAnalyzed() + } + assert(e.getMessage.contains( + "Cannot explode *, explode can only be applied on a specific column.")) + + df.explode('prefix, 'csv) { case Row(prefix: String, csv: String) => + csv.split(",").map(v => Tuple1(prefix + ":" + v)).toSeq + }.queryExecution.assertAnalyzed() + } + test("explode alias and star") { val df = Seq((Array("a"), 1)).toDF("a", "b") @@ -415,23 +424,6 @@ class DataFrameSuite extends QueryTest with SQLTestUtils { assert(df.schema.map(_.name) === Seq("key", "valueRenamed", "newCol")) } - test("randomSplit") { - val n = 600 - val data = sqlContext.sparkContext.parallelize(1 to n, 2).toDF("id") - for (seed <- 1 to 5) { - val splits = data.randomSplit(Array[Double](1, 2, 3), seed) - assert(splits.length == 3, "wrong number of splits") - - assert(splits.reduce((a, b) => a.unionAll(b)).sort("id").collect().toList == - data.collect().toList, "incomplete or wrong split") - - val s = splits.map(_.count()) - assert(math.abs(s(0) - 100) < 50) // std = 9.13 - assert(math.abs(s(1) - 200) < 50) // std = 11.55 - assert(math.abs(s(2) - 300) < 50) // std = 12.25 - } - } - test("describe") { val describeTestData = Seq( ("Bob", 16, 176), @@ -487,20 +479,23 @@ class DataFrameSuite extends QueryTest with SQLTestUtils { } test("inputFiles") { - val fakeRelation1 = new ParquetRelation(Array("/my/path", "/my/other/path"), - Some(testData.schema), None, Map.empty)(sqlContext) - val df1 = DataFrame(sqlContext, LogicalRelation(fakeRelation1)) - assert(df1.inputFiles.toSet == fakeRelation1.paths.toSet) + withTempDir { dir => + val df = Seq((1, 22)).toDF("a", "b") - val fakeRelation2 = new JSONRelation("/json/path", 1, Some(testData.schema), sqlContext) - val df2 = DataFrame(sqlContext, LogicalRelation(fakeRelation2)) - assert(df2.inputFiles.toSet == fakeRelation2.path.toSet) + val parquetDir = new File(dir, "parquet").getCanonicalPath + df.write.parquet(parquetDir) + val parquetDF = sqlContext.read.parquet(parquetDir) + assert(parquetDF.inputFiles.nonEmpty) - val unionDF = df1.unionAll(df2) - assert(unionDF.inputFiles.toSet == fakeRelation1.paths.toSet ++ fakeRelation2.path) + val jsonDir = new File(dir, "json").getCanonicalPath + df.write.json(jsonDir) + val jsonDF = sqlContext.read.json(jsonDir) + assert(parquetDF.inputFiles.nonEmpty) - val filtered = df1.filter("false").unionAll(df2.intersect(df2)) - assert(filtered.inputFiles.toSet == fakeRelation1.paths.toSet ++ fakeRelation2.path) + val unioned = jsonDF.unionAll(parquetDF).inputFiles.sorted + val allFiles = (jsonDF.inputFiles ++ parquetDF.inputFiles).toSet.toArray.sorted + assert(unioned === allFiles) + } } ignore("show") { @@ -685,18 +680,6 @@ class DataFrameSuite extends QueryTest with SQLTestUtils { Seq(Row(2, 1, 2), Row(1, 1, 1))) } - test("SPARK-7276: Project collapse for continuous select") { - var df = testData - for (i <- 1 to 5) { - df = df.select($"*") - } - - import org.apache.spark.sql.catalyst.plans.logical.Project - // make sure df have at most two Projects - val p = df.logicalPlan.asInstanceOf[Project].child.asInstanceOf[Project] - assert(!p.child.isInstanceOf[Project]) - } - 
test("SPARK-7150 range api") { // numSlice is greater than length val res1 = sqlContext.range(0, 10, 1, 15).select("id") @@ -884,4 +867,24 @@ class DataFrameSuite extends QueryTest with SQLTestUtils { val actual = df.sort(rand(seed)).collect().map(_.getInt(0)) assert(expected === actual) } + + test("SPARK-9323: DataFrame.orderBy should support nested column name") { + val df = sqlContext.read.json(sqlContext.sparkContext.makeRDD( + """{"a": {"b": 1}}""" :: Nil)) + checkAnswer(df.orderBy("a.b"), Row(Row(1))) + } + + test("SPARK-9950: correctly analyze grouping/aggregating on struct fields") { + val df = Seq(("x", (1, 1)), ("y", (2, 2))).toDF("a", "b") + checkAnswer(df.groupBy("b._1").agg(sum("b._2")), Row(1, 1) :: Row(2, 2) :: Nil) + } + + test("SPARK-10093: Avoid transformations on executors") { + val df = Seq((1, 1)).toDF("a", "b") + df.where($"a" === 1) + .select($"a", $"b", struct($"b")) + .orderBy("a") + .select(struct($"b")) + .collect() + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala index bf8ef9a97bc6..77907e91363e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql import org.apache.spark.sql.functions._ -import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ /** @@ -27,10 +27,8 @@ import org.apache.spark.sql.types._ * This is here for now so I can make sure Tungsten project is tested without refactoring existing * end-to-end test infra. In the long run this should just go away. */ -class DataFrameTungstenSuite extends QueryTest with SQLTestUtils { - - override lazy val sqlContext: SQLContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.implicits._ +class DataFrameTungstenSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("test simple types") { withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 17897caf952a..9080c53c491a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -22,19 +22,18 @@ import java.text.SimpleDateFormat import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.CalendarInterval -class DateFunctionsSuite extends QueryTest { - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - - import ctx.implicits._ +class DateFunctionsSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("function current_date") { val df1 = Seq((1, 2), (3, 1)).toDF("a", "b") val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis()) val d1 = DateTimeUtils.fromJavaDate(df1.select(current_date()).collect().head.getDate(0)) val d2 = DateTimeUtils.fromJavaDate( - ctx.sql("""SELECT CURRENT_DATE()""").collect().head.getDate(0)) + sql("""SELECT CURRENT_DATE()""").collect().head.getDate(0)) val d3 = DateTimeUtils.millisToDays(System.currentTimeMillis()) assert(d0 <= d1 && d1 <= d2 && d2 <= d3 && d3 - d0 <= 1) } @@ -44,9 +43,9 @@ class DateFunctionsSuite extends QueryTest { 
    val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
     checkAnswer(df1.select(countDistinct(current_timestamp())), Row(1))
     // Execution in one query should return the same value
-    checkAnswer(ctx.sql("""SELECT CURRENT_TIMESTAMP() = CURRENT_TIMESTAMP()"""),
+    checkAnswer(sql("""SELECT CURRENT_TIMESTAMP() = CURRENT_TIMESTAMP()"""),
       Row(true))
-    assert(math.abs(ctx.sql("""SELECT CURRENT_TIMESTAMP()""").collect().head.getTimestamp(
+    assert(math.abs(sql("""SELECT CURRENT_TIMESTAMP()""").collect().head.getTimestamp(
       0).getTime - System.currentTimeMillis()) < 5000)
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala
new file mode 100644
index 000000000000..8d2f45d70308
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.sql
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
+import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.{Row, Strategy, QueryTest}
+import org.apache.spark.sql.test.SharedSQLContext
+import org.apache.spark.unsafe.types.UTF8String
+
+case class FastOperator(output: Seq[Attribute]) extends SparkPlan {
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    val str = Literal("so fast").value
+    val row = new GenericInternalRow(Array[Any](str))
+    sparkContext.parallelize(Seq(row))
+  }
+
+  override def children: Seq[SparkPlan] = Nil
+}
+
+object TestStrategy extends Strategy {
+  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case Project(Seq(attr), _) if attr.name == "a" =>
+      FastOperator(attr.toAttribute :: Nil) :: Nil
+    case _ => Nil
+  }
+}
+
+class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  test("insert an extraStrategy") {
+    try {
+      sqlContext.experimental.extraStrategies = TestStrategy :: Nil
+
+      val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
+      checkAnswer(
+        df.select("a"),
+        Row("so fast"))
+
+      checkAnswer(
+        df.select("a", "b"),
+        Row("so slow", 1))
+    } finally {
+      sqlContext.experimental.extraStrategies = Nil
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 5bef1d896603..f5c5046a8ed8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -17,32 +17,26 @@ package
org.apache.spark.sql -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.execution.joins._ -import org.apache.spark.sql.types.BinaryType +import org.apache.spark.sql.test.SharedSQLContext -class JoinSuite extends QueryTest with BeforeAndAfterEach { - // Ensures tables are loaded. - TestData +class JoinSuite extends QueryTest with SharedSQLContext { + import testImplicits._ - lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - import ctx.logicalPlanToSparkQuery + setupTestData() test("equi-join is hash-join") { val x = testData2.as("x") val y = testData2.as("y") val join = x.join(y, $"x.a" === $"y.a", "inner").queryExecution.optimizedPlan - val planned = ctx.planner.HashJoin(join) + val planned = ctx.planner.EquiJoinSelection(join) assert(planned.size === 1) } def assertJoin(sqlString: String, c: Class[_]): Any = { - val df = ctx.sql(sqlString) + val df = sql(sqlString) val physical = df.queryExecution.sparkPlan val operators = physical.collect { case j: ShuffledHashJoin => j @@ -55,6 +49,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { case j: BroadcastNestedLoopJoin => j case j: BroadcastLeftSemiJoinHash => j case j: SortMergeJoin => j + case j: SortMergeOuterJoin => j } assert(operators.size === 1) @@ -66,7 +61,6 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { test("join operator selection") { ctx.cacheManager.clearCache() - val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled Seq( ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]), ("SELECT * FROM testData LEFT SEMI JOIN testData2", classOf[LeftSemiJoinBNL]), @@ -83,11 +77,11 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]), ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]), ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin]), - ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[SortMergeOuterJoin]), ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2", - classOf[ShuffledHashOuterJoin]), + classOf[SortMergeOuterJoin]), ("SELECT * FROM testData right join testData2 ON key = a and key = 2", - classOf[ShuffledHashOuterJoin]), + classOf[SortMergeOuterJoin]), ("SELECT * FROM testData full outer join testData2 ON key = a", classOf[ShuffledHashOuterJoin]), ("SELECT * FROM testData left JOIN testData2 ON (key * a != key + a)", @@ -97,90 +91,83 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { ("SELECT * FROM testData full JOIN testData2 ON (key * a != key + a)", classOf[BroadcastNestedLoopJoin]) ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - try { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true) + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { Seq( - ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]), - ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]), - ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin]) + ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[ShuffledHashJoin]), + ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", + 
classOf[ShuffledHashJoin]), + ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", + classOf[ShuffledHashJoin]), + ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2", + classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData right join testData2 ON key = a and key = 2", + classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData full outer join testData2 ON key = a", + classOf[ShuffledHashOuterJoin]) ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - } finally { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED) } } test("SortMergeJoin shouldn't work on unsortable columns") { - val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled - try { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true) + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { Seq( ("SELECT * FROM arrayData JOIN complexData ON data = a", classOf[ShuffledHashJoin]) ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - } finally { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED) } } test("broadcasted hash join operator selection") { ctx.cacheManager.clearCache() - ctx.sql("CACHE TABLE testData") - - val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled - Seq( - ("SELECT * FROM testData join testData2 ON key = a", classOf[BroadcastHashJoin]), - ("SELECT * FROM testData join testData2 ON key = a and key = 2", classOf[BroadcastHashJoin]), - ("SELECT * FROM testData join testData2 ON key = a where key = 2", - classOf[BroadcastHashJoin]) - ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - try { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true) - Seq( - ("SELECT * FROM testData join testData2 ON key = a", classOf[BroadcastHashJoin]), - ("SELECT * FROM testData join testData2 ON key = a and key = 2", - classOf[BroadcastHashJoin]), - ("SELECT * FROM testData join testData2 ON key = a where key = 2", - classOf[BroadcastHashJoin]) - ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - } finally { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED) + sql("CACHE TABLE testData") + for (sortMergeJoinEnabled <- Seq(true, false)) { + withClue(s"sortMergeJoinEnabled=$sortMergeJoinEnabled") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> s"$sortMergeJoinEnabled") { + Seq( + ("SELECT * FROM testData join testData2 ON key = a", + classOf[BroadcastHashJoin]), + ("SELECT * FROM testData join testData2 ON key = a and key = 2", + classOf[BroadcastHashJoin]), + ("SELECT * FROM testData join testData2 ON key = a where key = 2", + classOf[BroadcastHashJoin]) + ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } + } + } } - - ctx.sql("UNCACHE TABLE testData") + sql("UNCACHE TABLE testData") } test("broadcasted hash outer join operator selection") { ctx.cacheManager.clearCache() - ctx.sql("CACHE TABLE testData") - - val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled - Seq( - ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]), - ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2", - classOf[BroadcastHashOuterJoin]), - ("SELECT * FROM testData right join testData2 ON key = a and key = 2", - classOf[BroadcastHashOuterJoin]) - ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - try { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true) + sql("CACHE TABLE testData") + 
withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { Seq( - ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", + classOf[SortMergeOuterJoin]), ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2", classOf[BroadcastHashOuterJoin]), ("SELECT * FROM testData right join testData2 ON key = a and key = 2", classOf[BroadcastHashOuterJoin]) ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } - } finally { - ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED) } - - ctx.sql("UNCACHE TABLE testData") + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + Seq( + ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", + classOf[ShuffledHashOuterJoin]), + ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2", + classOf[BroadcastHashOuterJoin]), + ("SELECT * FROM testData right join testData2 ON key = a and key = 2", + classOf[BroadcastHashOuterJoin]) + ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } + } + sql("UNCACHE TABLE testData") } test("multiple-key equi-join is hash-join") { val x = testData2.as("x") val y = testData2.as("y") val join = x.join(y, ($"x.a" === $"y.a") && ($"x.b" === $"y.b")).queryExecution.optimizedPlan - val planned = ctx.planner.HashJoin(join) + val planned = ctx.planner.EquiJoinSelection(join) assert(planned.size === 1) } @@ -285,7 +272,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { // Make sure we are choosing left.outputPartitioning as the // outputPartitioning for the outer join operator. checkAnswer( - ctx.sql( + sql( """ |SELECT l.N, count(*) |FROM upperCaseData l LEFT OUTER JOIN allNulls r ON (l.N = r.a) @@ -299,7 +286,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, 1) :: Nil) checkAnswer( - ctx.sql( + sql( """ |SELECT r.a, count(*) |FROM upperCaseData l LEFT OUTER JOIN allNulls r ON (l.N = r.a) @@ -345,7 +332,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { // Make sure we are choosing right.outputPartitioning as the // outputPartitioning for the outer join operator. checkAnswer( - ctx.sql( + sql( """ |SELECT l.a, count(*) |FROM allNulls l RIGHT OUTER JOIN upperCaseData r ON (l.a = r.N) @@ -354,7 +341,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, 6)) checkAnswer( - ctx.sql( + sql( """ |SELECT r.N, count(*) |FROM allNulls l RIGHT OUTER JOIN upperCaseData r ON (l.a = r.N) @@ -406,7 +393,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { // Make sure we are UnknownPartitioning as the outputPartitioning for the outer join operator. 
checkAnswer( - ctx.sql( + sql( """ |SELECT l.a, count(*) |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N) @@ -415,7 +402,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, 10)) checkAnswer( - ctx.sql( + sql( """ |SELECT r.N, count(*) |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N) @@ -430,7 +417,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, 4) :: Nil) checkAnswer( - ctx.sql( + sql( """ |SELECT l.N, count(*) |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) @@ -445,7 +432,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, 4) :: Nil) checkAnswer( - ctx.sql( + sql( """ |SELECT r.a, count(*) |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) @@ -456,31 +443,30 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { test("broadcasted left semi join operator selection") { ctx.cacheManager.clearCache() - ctx.sql("CACHE TABLE testData") - val tmp = ctx.conf.autoBroadcastJoinThreshold + sql("CACHE TABLE testData") - ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=1000000000") - Seq( - ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", - classOf[BroadcastLeftSemiJoinHash]) - ).foreach { - case (query, joinClass) => assertJoin(query, joinClass) + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1000000000") { + Seq( + ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", + classOf[BroadcastLeftSemiJoinHash]) + ).foreach { + case (query, joinClass) => assertJoin(query, joinClass) + } } - ctx.sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1") - - Seq( - ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]) - ).foreach { - case (query, joinClass) => assertJoin(query, joinClass) + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + Seq( + ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]) + ).foreach { + case (query, joinClass) => assertJoin(query, joinClass) + } } - ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp) - ctx.sql("UNCACHE TABLE testData") + sql("UNCACHE TABLE testData") } test("left semi join") { - val df = ctx.sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a") + val df = sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a") checkAnswer(df, Row(1, 1) :: Row(1, 2) :: @@ -488,6 +474,5 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala similarity index 58% rename from sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala rename to sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 0d4e30f29255..045fea82e4c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -17,20 +17,16 @@ package org.apache.spark.sql -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.expressions.{Expression} -import org.apache.spark.sql.execution.aggregate.ScalaUDAF -import org.apache.spark.sql.expressions.UserDefinedAggregateFunction +import org.apache.spark.sql.test.SharedSQLContext -class UDAFRegistration private[sql] (sqlContext: SQLContext) extends Logging { +class JsonFunctionsSuite extends QueryTest with SharedSQLContext { + import 
testImplicits._ - private val functionRegistry = sqlContext.functionRegistry - - def register( - name: String, - func: UserDefinedAggregateFunction): UserDefinedAggregateFunction = { - def builder(children: Seq[Expression]) = ScalaUDAF(children, func) - functionRegistry.registerFunction(name, builder) - func + test("function get_json_object") { + val df: DataFrame = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b") + checkAnswer( + df.selectExpr("get_json_object(a, '$.name')", "get_json_object(a, '$.age')"), + Row("alice", "5")) } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala index 2089660c52bf..babf8835d254 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -19,12 +19,11 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfter +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} -class ListTablesSuite extends QueryTest with BeforeAndAfter { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContext { + import testImplicits._ private lazy val df = (1 to 10).map(i => (i, s"str$i")).toDF("key", "value") @@ -42,7 +41,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { Row("ListTablesSuiteTable", true)) checkAnswer( - ctx.sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"), + sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable")) @@ -55,7 +54,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { Row("ListTablesSuiteTable", true)) checkAnswer( - ctx.sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"), + sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable")) @@ -67,13 +66,13 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { StructField("tableName", StringType, false) :: StructField("isTemporary", BooleanType, false) :: Nil) - Seq(ctx.tables(), ctx.sql("SHOW TABLes")).foreach { + Seq(ctx.tables(), sql("SHOW TABLes")).foreach { case tableDF => assert(expectedSchema === tableDF.schema) tableDF.registerTempTable("tables") checkAnswer( - ctx.sql( + sql( "SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"), Row(true, "ListTablesSuiteTable") ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala index 8cf2ef5957d8..30289c3c1d09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala @@ -19,18 +19,16 @@ package org.apache.spark.sql import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions.{log => logarithm} +import org.apache.spark.sql.test.SharedSQLContext private object MathExpressionsTestData { case class DoubleData(a: java.lang.Double, b: java.lang.Double) case class NullDoubles(a: java.lang.Double) } -class MathExpressionsSuite extends QueryTest { - +class MathExpressionsSuite extends QueryTest with SharedSQLContext { import 
MathExpressionsTestData._ - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ + import testImplicits._ private lazy val doubleData = (1 to 10).map(i => DoubleData(i * 0.2 - 1, i * -0.2 + 1)).toDF() @@ -149,7 +147,7 @@ class MathExpressionsSuite extends QueryTest { test("toDegrees") { testOneToOneMathFunction(toDegrees, math.toDegrees) checkAnswer( - ctx.sql("SELECT degrees(0), degrees(1), degrees(1.5)"), + sql("SELECT degrees(0), degrees(1), degrees(1.5)"), Seq((1, 2)).toDF().select(toDegrees(lit(0)), toDegrees(lit(1)), toDegrees(lit(1.5))) ) } @@ -157,7 +155,7 @@ class MathExpressionsSuite extends QueryTest { test("toRadians") { testOneToOneMathFunction(toRadians, math.toRadians) checkAnswer( - ctx.sql("SELECT radians(0), radians(1), radians(1.5)"), + sql("SELECT radians(0), radians(1), radians(1.5)"), Seq((1, 2)).toDF().select(toRadians(lit(0)), toRadians(lit(1)), toRadians(lit(1.5))) ) } @@ -169,7 +167,7 @@ class MathExpressionsSuite extends QueryTest { test("ceil and ceiling") { testOneToOneMathFunction(ceil, math.ceil) checkAnswer( - ctx.sql("SELECT ceiling(0), ceiling(1), ceiling(1.5)"), + sql("SELECT ceiling(0), ceiling(1), ceiling(1.5)"), Row(0.0, 1.0, 2.0)) } @@ -214,7 +212,7 @@ class MathExpressionsSuite extends QueryTest { val pi = 3.1415 checkAnswer( - ctx.sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " + + sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " + s"round($pi, 0), round($pi, 1), round($pi, 2), round($pi, 3)"), Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3), BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142"))) @@ -233,7 +231,7 @@ class MathExpressionsSuite extends QueryTest { testOneToOneMathFunction[Double](signum, math.signum) checkAnswer( - ctx.sql("SELECT sign(10), signum(-11)"), + sql("SELECT sign(10), signum(-11)"), Row(1, -1)) } @@ -241,7 +239,7 @@ class MathExpressionsSuite extends QueryTest { testTwoToOneMathFunction(pow, pow, math.pow) checkAnswer( - ctx.sql("SELECT pow(1, 2), power(2, 1)"), + sql("SELECT pow(1, 2), power(2, 1)"), Seq((1, 2)).toDF().select(pow(lit(1), lit(2)), pow(lit(2), lit(1))) ) } @@ -280,7 +278,7 @@ class MathExpressionsSuite extends QueryTest { test("log / ln") { testOneToOneNonNegativeMathFunction(org.apache.spark.sql.functions.log, math.log) checkAnswer( - ctx.sql("SELECT ln(0), ln(1), ln(1.5)"), + sql("SELECT ln(0), ln(1), ln(1.5)"), Seq((1, 2)).toDF().select(logarithm(lit(0)), logarithm(lit(1)), logarithm(lit(1.5))) ) } @@ -375,7 +373,7 @@ class MathExpressionsSuite extends QueryTest { df.select(log2("b") + log2("a")), Row(1)) - checkAnswer(ctx.sql("SELECT LOG2(8), LOG2(null)"), Row(3, null)) + checkAnswer(sql("SELECT LOG2(8), LOG2(null)"), Row(3, null)) } test("sqrt") { @@ -384,13 +382,13 @@ class MathExpressionsSuite extends QueryTest { df.select(sqrt("a"), sqrt("b")), Row(1.0, 2.0)) - checkAnswer(ctx.sql("SELECT SQRT(4.0), SQRT(null)"), Row(2.0, null)) + checkAnswer(sql("SELECT SQRT(4.0), SQRT(null)"), Row(2.0, null)) checkAnswer(df.selectExpr("sqrt(a)", "sqrt(b)", "sqrt(null)"), Row(1.0, 2.0, null)) } test("negative") { checkAnswer( - ctx.sql("SELECT negative(1), negative(0), negative(-1)"), + sql("SELECT negative(1), negative(0), negative(-1)"), Row(-1, 0, 1)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 98ba3c99283a..4adcefb7dc4b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -71,12 +71,6 @@ class QueryTest extends PlanTest { checkAnswer(df, expectedAnswer.collect()) } - def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext) { - test(sqlString) { - checkAnswer(sqlContext.sql(sqlString), expectedAnswer) - } - } - /** * Asserts that a given [[DataFrame]] will be executed using the given number of cached results. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala index 8a679c7865d6..77ccd6f775e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala @@ -20,13 +20,12 @@ package org.apache.spark.sql import org.apache.spark.SparkFunSuite import org.apache.spark.sql.execution.SparkSqlSerializer import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, SpecificMutableRow} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -class RowSuite extends SparkFunSuite { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class RowSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ test("create row") { val expected = new GenericMutableRow(4) @@ -86,4 +85,13 @@ class RowSuite extends SparkFunSuite { val r2 = Row(Double.NaN) assert(r1 === r2) } + + test("equals and hashCode") { + val r1 = Row("Hello") + val r2 = Row("Hello") + assert(r1 === r2) + assert(r1.hashCode() === r2.hashCode()) + val r3 = Row("World") + assert(r3.hashCode() != r1.hashCode()) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala index 75791e9d53c2..7699adadd9cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala @@ -17,11 +17,10 @@ package org.apache.spark.sql +import org.apache.spark.sql.test.SharedSQLContext -class SQLConfSuite extends QueryTest { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext +class SQLConfSuite extends QueryTest with SharedSQLContext { private val testKey = "test.key.0" private val testVal = "test.val.0" @@ -52,21 +51,21 @@ class SQLConfSuite extends QueryTest { test("parse SQL set commands") { ctx.conf.clear() - ctx.sql(s"set $testKey=$testVal") + sql(s"set $testKey=$testVal") assert(ctx.getConf(testKey, testVal + "_") === testVal) assert(ctx.getConf(testKey, testVal + "_") === testVal) - ctx.sql("set some.property=20") + sql("set some.property=20") assert(ctx.getConf("some.property", "0") === "20") - ctx.sql("set some.property = 40") + sql("set some.property = 40") assert(ctx.getConf("some.property", "0") === "40") val key = "spark.sql.key" val vs = "val0,val_1,val2.3,my_table" - ctx.sql(s"set $key=$vs") + sql(s"set $key=$vs") assert(ctx.getConf(key, "0") === vs) - ctx.sql(s"set $key=") + sql(s"set $key=") assert(ctx.getConf(key, "0") === "") ctx.conf.clear() @@ -74,14 +73,14 @@ class SQLConfSuite extends QueryTest { test("deprecated property") { ctx.conf.clear() - ctx.sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10") + sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10") assert(ctx.conf.numShufflePartitions === 10) } test("invalid conf value") { ctx.conf.clear() val e = intercept[IllegalArgumentException] { - ctx.sql(s"set 
${SQLConf.CASE_SENSITIVE.key}=10") + sql(s"set ${SQLConf.CASE_SENSITIVE.key}=10") } assert(e.getMessage === s"${SQLConf.CASE_SENSITIVE.key} should be boolean, but was 10") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala index c8d8796568a4..007be1295077 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala @@ -17,16 +17,17 @@ package org.apache.spark.sql -import org.scalatest.BeforeAndAfterAll - import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.test.SharedSQLContext -class SQLContextSuite extends SparkFunSuite with BeforeAndAfterAll { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext +class SQLContextSuite extends SparkFunSuite with SharedSQLContext { override def afterAll(): Unit = { - SQLContext.setLastInstantiatedContext(ctx) + try { + SQLContext.setLastInstantiatedContext(ctx) + } finally { + super.afterAll() + } } test("getOrCreate instantiates SQLContext") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bbadc202a4f0..9e172b2c264c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -17,30 +17,26 @@ package org.apache.spark.sql +import java.math.MathContext import java.sql.Timestamp -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.AccumulatorSuite import org.apache.spark.sql.catalyst.DefaultParserDialect +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.errors.DialectException import org.apache.spark.sql.execution.aggregate -import org.apache.spark.sql.execution.GeneratedAggregate import org.apache.spark.sql.functions._ -import org.apache.spark.sql.TestData._ -import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.test.SQLTestData._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ /** A SQL Dialect for testing purpose, and it can not be nested type */ class MyDialect extends DefaultParserDialect -class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { - // Make sure the tables are loaded. 
- TestData +class SQLQuerySuite extends QueryTest with SharedSQLContext { + import testImplicits._ - val sqlContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.implicits._ - import sqlContext.sql + setupTestData() test("having clause") { Seq(("one", 1), ("two", 2), ("three", 3), ("one", 5)).toDF("k", "v").registerTempTable("hav") @@ -60,7 +56,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { } test("show functions") { - checkAnswer(sql("SHOW functions"), FunctionRegistry.builtin.listFunction().sorted.map(Row(_))) + checkAnswer(sql("SHOW functions"), + FunctionRegistry.builtin.listFunction().sorted.map(Row(_))) } test("describe functions") { @@ -178,7 +175,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { val df = Seq(Tuple1(1), Tuple1(2), Tuple1(3)).toDF("index") // we except the id is materialized once - val idUDF = udf(() => UUID.randomUUID().toString) + val idUDF = org.apache.spark.sql.functions.udf(() => UUID.randomUUID().toString) val dfWithId = df.withColumn("id", idUDF()) // Make a new DataFrame (actually the same reference to the old one) @@ -258,6 +255,23 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { } } + private def testCodeGen(sqlText: String, expectedResults: Seq[Row]): Unit = { + val df = sql(sqlText) + // First, check if we have GeneratedAggregate. + val hasGeneratedAgg = df.queryExecution.executedPlan + .collect { case _: aggregate.TungstenAggregate => true } + .nonEmpty + if (!hasGeneratedAgg) { + fail( + s""" + |Codegen is enabled, but query $sqlText does not have TungstenAggregate in the plan. + |${df.queryExecution.simpleString} + """.stripMargin) + } + // Then, check results. + checkAnswer(df, expectedResults) + } + test("aggregation with codegen") { val originalValue = sqlContext.conf.codegenEnabled sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true) @@ -267,26 +281,6 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { .unionAll(sqlContext.table("testData")) .registerTempTable("testData3x") - def testCodeGen(sqlText: String, expectedResults: Seq[Row]): Unit = { - val df = sql(sqlText) - // First, check if we have GeneratedAggregate. - var hasGeneratedAgg = false - df.queryExecution.executedPlan.foreach { - case generatedAgg: GeneratedAggregate => hasGeneratedAgg = true - case newAggregate: aggregate.Aggregate => hasGeneratedAgg = true - case _ => - } - if (!hasGeneratedAgg) { - fail( - s""" - |Codegen is enabled, but query $sqlText does not have GeneratedAggregate in the plan. - |${df.queryExecution.simpleString} - """.stripMargin) - } - // Then, check results. - checkAnswer(df, expectedResults) - } - try { // Just to group rows. 
testCodeGen( @@ -715,9 +709,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { checkAnswer( sql( - """ - |SELECT COUNT(a), COUNT(b), COUNT(1), COUNT(DISTINCT a), COUNT(DISTINCT b) FROM testData3 - """.stripMargin), + "SELECT COUNT(a), COUNT(b), COUNT(1), COUNT(DISTINCT a), COUNT(DISTINCT b) FROM testData3"), Row(2, 1, 2, 2, 1)) } @@ -1164,7 +1156,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { validateMetadata(sql("SELECT * FROM personWithMeta")) validateMetadata(sql("SELECT id, name FROM personWithMeta")) validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON id = personId")) - validateMetadata(sql("SELECT name, salary FROM personWithMeta JOIN salary ON id = personId")) + validateMetadata(sql( + "SELECT name, salary FROM personWithMeta JOIN salary ON id = personId")) } test("SPARK-3371 Renaming a function expression with group by gives error") { @@ -1604,4 +1597,124 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { checkAnswer(df.select(-df("i")), Row(new CalendarInterval(-(12 * 3 - 3), -(7L * MICROS_PER_WEEK + 123)))) } + + test("aggregation with codegen updates peak execution memory") { + withSQLConf((SQLConf.CODEGEN_ENABLED.key, "true")) { + val sc = sqlContext.sparkContext + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "aggregation with codegen") { + testCodeGen( + "SELECT key, count(value) FROM testData GROUP BY key", + (1 to 100).map(i => Row(i, 1))) + } + } + } + + test("decimal precision with multiply/division") { + checkAnswer(sql("select 10.3 * 3.0"), Row(BigDecimal("30.90"))) + checkAnswer(sql("select 10.3000 * 3.0"), Row(BigDecimal("30.90000"))) + checkAnswer(sql("select 10.30000 * 30.0"), Row(BigDecimal("309.000000"))) + checkAnswer(sql("select 10.300000000000000000 * 3.000000000000000000"), + Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38)))) + checkAnswer(sql("select 10.300000000000000000 * 3.0000000000000000000"), + Row(null)) + + checkAnswer(sql("select 10.3 / 3.0"), Row(BigDecimal("3.433333"))) + checkAnswer(sql("select 10.3000 / 3.0"), Row(BigDecimal("3.4333333"))) + checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333"))) + checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"), + Row(BigDecimal("3.433333333333333333333333333", new MathContext(38)))) + checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"), + Row(BigDecimal("3.4333333333333333333333333333", new MathContext(38)))) + } + + test("SPARK-10215 Div of Decimal returns null") { + val d = Decimal(1.12321) + val df = Seq((d, 1)).toDF("a", "b") + + checkAnswer( + df.selectExpr("b * a / b"), + Seq(Row(d.toBigDecimal))) + checkAnswer( + df.selectExpr("b * a / b / b"), + Seq(Row(d.toBigDecimal))) + checkAnswer( + df.selectExpr("b * a + b"), + Seq(Row(BigDecimal(2.12321)))) + checkAnswer( + df.selectExpr("b * a - b"), + Seq(Row(BigDecimal(0.12321)))) + checkAnswer( + df.selectExpr("b * a * b"), + Seq(Row(d.toBigDecimal))) + } + + test("precision smaller than scale") { + checkAnswer(sql("select 10.00"), Row(BigDecimal("10.00"))) + checkAnswer(sql("select 1.00"), Row(BigDecimal("1.00"))) + checkAnswer(sql("select 0.10"), Row(BigDecimal("0.10"))) + checkAnswer(sql("select 0.01"), Row(BigDecimal("0.01"))) + checkAnswer(sql("select 0.001"), Row(BigDecimal("0.001"))) + checkAnswer(sql("select -0.01"), Row(BigDecimal("-0.01"))) + checkAnswer(sql("select -0.001"), Row(BigDecimal("-0.001"))) + } + + test("external sorting 
updates peak execution memory") { + withSQLConf((SQLConf.EXTERNAL_SORT.key, "true")) { + val sc = sqlContext.sparkContext + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external sort") { + sortTest() + } + } + } + + test("SPARK-9511: error with table starting with number") { + withTempTable("1one") { + sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)) + .toDF("num", "str") + .registerTempTable("1one") + checkAnswer(sql("select count(num) from 1one"), Row(10)) + } + } + + test("specifying database name for a temporary table is not allowed") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = + sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + df + .write + .format("parquet") + .save(path) + + val message = intercept[AnalysisException] { + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE db.t + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("Specifying database name or other qualifiers are not allowed")) + + // If you use backticks to quote the name of a temporary table having dot in it. + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE `db.t` + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + checkAnswer(sqlContext.table("`db.t`"), df) + } + } + + test("SPARK-10130 type coercion for IF should have children resolved first") { + val df = Seq((1, 1), (-1, 1)).toDF("key", "value") + df.registerTempTable("src") + checkAnswer( + sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index ab6d3dd96d27..295f02f9a7b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.test.SharedSQLContext case class ReflectData( stringField: String, @@ -71,17 +72,15 @@ case class ComplexReflectData( mapFieldContainsNull: Map[Int, Option[Long]], dataField: Data) -class ScalaReflectionRelationSuite extends SparkFunSuite { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ test("query case class RDD") { val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3)) Seq(data).toDF().registerTempTable("reflectData") - assert(ctx.sql("SELECT * FROM reflectData").collect().head === + assert(sql("SELECT * FROM reflectData").collect().head === Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3))) @@ -91,7 +90,7 @@ class ScalaReflectionRelationSuite extends SparkFunSuite { val data = NullReflectData(null, null, null, null, null, null, null) Seq(data).toDF().registerTempTable("reflectNullData") - assert(ctx.sql("SELECT * FROM reflectNullData").collect().head === + assert(sql("SELECT * FROM reflectNullData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -99,7 +98,7 @@ class ScalaReflectionRelationSuite 
extends SparkFunSuite { val data = OptionalReflectData(None, None, None, None, None, None, None) Seq(data).toDF().registerTempTable("reflectOptionalData") - assert(ctx.sql("SELECT * FROM reflectOptionalData").collect().head === + assert(sql("SELECT * FROM reflectOptionalData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -107,7 +106,7 @@ class ScalaReflectionRelationSuite extends SparkFunSuite { test("query binary data") { Seq(ReflectBinary(Array[Byte](1))).toDF().registerTempTable("reflectBinary") - val result = ctx.sql("SELECT data FROM reflectBinary") + val result = sql("SELECT data FROM reflectBinary") .collect().head(0).asInstanceOf[Array[Byte]] assert(result.toSeq === Seq[Byte](1)) } @@ -126,7 +125,7 @@ class ScalaReflectionRelationSuite extends SparkFunSuite { Nested(None, "abc"))) Seq(data).toDF().registerTempTable("reflectComplexData") - assert(ctx.sql("SELECT * FROM reflectComplexData").collect().head === + assert(sql("SELECT * FROM reflectComplexData").collect().head === Row( Seq(1, 2, 3), Seq(1, 2, null), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala index e55c9e460b79..45d0ee4a8e74 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.sql.test.SharedSQLContext -class SerializationSuite extends SparkFunSuite { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext +class SerializationSuite extends SparkFunSuite with SharedSQLContext { test("[SPARK-5235] SQLContext should be serializable") { - val sqlContext = new SQLContext(ctx.sparkContext) - new JavaSerializer(new SparkConf()).newInstance().serialize(sqlContext) + val _sqlContext = new SQLContext(sqlContext.sparkContext) + new JavaSerializer(new SparkConf()).newInstance().serialize(_sqlContext) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index ab5da6ee79f1..b91438baea06 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -18,13 +18,12 @@ package org.apache.spark.sql import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.Decimal -class StringFunctionsSuite extends QueryTest { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class StringFunctionsSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("string concat") { val df = Seq[(String, String, String)](("a", "b", null)).toDF("a", "b", "c") @@ -128,6 +127,12 @@ class StringFunctionsSuite extends QueryTest { // scalastyle:on } + test("string translate") { + val df = Seq(("translate", "")).toDF("a", "b") + checkAnswer(df.select(translate($"a", "rnlt", "123")), Row("1a2s3ae")) + checkAnswer(df.selectExpr("""translate(a, "rnlt", "")"""), Row("asae")) + } + test("string trim functions") { val df = Seq((" example ", "")).toDF("a", "b") @@ -343,9 +348,9 @@ class StringFunctionsSuite extends QueryTest { // it will still use the interpretProjection if projection follows by a LocalRelation, // hence we add a filter 
operator. // See the optimizer rule `ConvertToLocalRelation` - val df2 = Seq((5L, 4), (4L, 3), (3L, 2)).toDF("a", "b") + val df2 = Seq((5L, 4), (4L, 3), (4L, 3), (4L, 3), (3L, 2)).toDF("a", "b") checkAnswer( df2.filter("b>0").selectExpr("format_number(a, b)"), - Row("5.0000") :: Row("4.000") :: Row("3.00") :: Nil) + Row("5.0000") :: Row("4.000") :: Row("4.000") :: Row("4.000") :: Row("3.00") :: Nil) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala deleted file mode 100644 index bd9729c431f3..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import org.apache.spark.sql.test.TestSQLContext.implicits._ -import org.apache.spark.sql.test._ - - -case class TestData(key: Int, value: String) - -object TestData { - val testData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toDF() - testData.registerTempTable("testData") - - val negativeData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() - negativeData.registerTempTable("negativeData") - - case class LargeAndSmallInts(a: Int, b: Int) - val largeAndSmallInts = - TestSQLContext.sparkContext.parallelize( - LargeAndSmallInts(2147483644, 1) :: - LargeAndSmallInts(1, 2) :: - LargeAndSmallInts(2147483645, 1) :: - LargeAndSmallInts(2, 2) :: - LargeAndSmallInts(2147483646, 1) :: - LargeAndSmallInts(3, 2) :: Nil).toDF() - largeAndSmallInts.registerTempTable("largeAndSmallInts") - - case class TestData2(a: Int, b: Int) - val testData2 = - TestSQLContext.sparkContext.parallelize( - TestData2(1, 1) :: - TestData2(1, 2) :: - TestData2(2, 1) :: - TestData2(2, 2) :: - TestData2(3, 1) :: - TestData2(3, 2) :: Nil, 2).toDF() - testData2.registerTempTable("testData2") - - case class DecimalData(a: BigDecimal, b: BigDecimal) - - val decimalData = - TestSQLContext.sparkContext.parallelize( - DecimalData(1, 1) :: - DecimalData(1, 2) :: - DecimalData(2, 1) :: - DecimalData(2, 2) :: - DecimalData(3, 1) :: - DecimalData(3, 2) :: Nil).toDF() - decimalData.registerTempTable("decimalData") - - case class BinaryData(a: Array[Byte], b: Int) - val binaryData = - TestSQLContext.sparkContext.parallelize( - BinaryData("12".getBytes(), 1) :: - BinaryData("22".getBytes(), 5) :: - BinaryData("122".getBytes(), 3) :: - BinaryData("121".getBytes(), 2) :: - BinaryData("123".getBytes(), 4) :: Nil).toDF() - binaryData.registerTempTable("binaryData") - - case class TestData3(a: Int, b: Option[Int]) - val testData3 = - TestSQLContext.sparkContext.parallelize( - TestData3(1, None) :: - TestData3(2, Some(2)) :: 
Nil).toDF() - testData3.registerTempTable("testData3") - - case class UpperCaseData(N: Int, L: String) - val upperCaseData = - TestSQLContext.sparkContext.parallelize( - UpperCaseData(1, "A") :: - UpperCaseData(2, "B") :: - UpperCaseData(3, "C") :: - UpperCaseData(4, "D") :: - UpperCaseData(5, "E") :: - UpperCaseData(6, "F") :: Nil).toDF() - upperCaseData.registerTempTable("upperCaseData") - - case class LowerCaseData(n: Int, l: String) - val lowerCaseData = - TestSQLContext.sparkContext.parallelize( - LowerCaseData(1, "a") :: - LowerCaseData(2, "b") :: - LowerCaseData(3, "c") :: - LowerCaseData(4, "d") :: Nil).toDF() - lowerCaseData.registerTempTable("lowerCaseData") - - case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) - val arrayData = - TestSQLContext.sparkContext.parallelize( - ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: - ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) - arrayData.toDF().registerTempTable("arrayData") - - case class MapData(data: scala.collection.Map[Int, String]) - val mapData = - TestSQLContext.sparkContext.parallelize( - MapData(Map(1 -> "a1", 2 -> "b1", 3 -> "c1", 4 -> "d1", 5 -> "e1")) :: - MapData(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: - MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: - MapData(Map(1 -> "a4", 2 -> "b4")) :: - MapData(Map(1 -> "a5")) :: Nil) - mapData.toDF().registerTempTable("mapData") - - case class StringData(s: String) - val repeatedData = - TestSQLContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) - repeatedData.toDF().registerTempTable("repeatedData") - - val nullableRepeatedData = - TestSQLContext.sparkContext.parallelize( - List.fill(2)(StringData(null)) ++ - List.fill(2)(StringData("test"))) - nullableRepeatedData.toDF().registerTempTable("nullableRepeatedData") - - case class NullInts(a: Integer) - val nullInts = - TestSQLContext.sparkContext.parallelize( - NullInts(1) :: - NullInts(2) :: - NullInts(3) :: - NullInts(null) :: Nil - ).toDF() - nullInts.registerTempTable("nullInts") - - val allNulls = - TestSQLContext.sparkContext.parallelize( - NullInts(null) :: - NullInts(null) :: - NullInts(null) :: - NullInts(null) :: Nil).toDF() - allNulls.registerTempTable("allNulls") - - case class NullStrings(n: Int, s: String) - val nullStrings = - TestSQLContext.sparkContext.parallelize( - NullStrings(1, "abc") :: - NullStrings(2, "ABC") :: - NullStrings(3, null) :: Nil).toDF() - nullStrings.registerTempTable("nullStrings") - - case class TableName(tableName: String) - TestSQLContext - .sparkContext - .parallelize(TableName("test") :: Nil) - .toDF() - .registerTempTable("tableName") - - val unparsedStrings = - TestSQLContext.sparkContext.parallelize( - "1, A1, true, null" :: - "2, B2, false, null" :: - "3, C3, true, null" :: - "4, D4, true, 2147483644" :: Nil) - - case class IntField(i: Int) - // An RDD with 4 elements and 8 partitions - val withEmptyParts = TestSQLContext.sparkContext.parallelize((1 to 4).map(IntField), 8) - withEmptyParts.toDF().registerTempTable("withEmptyParts") - - case class Person(id: Int, name: String, age: Int) - case class Salary(personId: Int, salary: Double) - val person = TestSQLContext.sparkContext.parallelize( - Person(0, "mike", 30) :: - Person(1, "jim", 20) :: Nil).toDF() - person.registerTempTable("person") - val salary = TestSQLContext.sparkContext.parallelize( - Salary(0, 2000.0) :: - Salary(1, 1000.0) :: Nil).toDF() - salary.registerTempTable("salary") - - case class ComplexData(m: Map[String, Int], s: TestData, a: Seq[Int], b: Boolean) - val complexData = - 
TestSQLContext.sparkContext.parallelize( - ComplexData(Map("1" -> 1), TestData(1, "1"), Seq(1, 1, 1), true) - :: ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2, 2, 2), false) - :: Nil).toDF() - complexData.registerTempTable("complexData") -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 183dc3407b3a..eb275af101e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,16 +17,13 @@ package org.apache.spark.sql -import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.SQLTestData._ -case class FunctionResult(f1: String, f2: String) +private case class FunctionResult(f1: String, f2: String) -class UDFSuite extends QueryTest with SQLTestUtils { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - - override def sqlContext(): SQLContext = ctx +class UDFSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("built-in fixed arity expressions") { val df = ctx.emptyDataFrame @@ -57,7 +54,7 @@ class UDFSuite extends QueryTest with SQLTestUtils { test("SPARK-8003 spark_partition_id") { val df = Seq((1, "Tearing down the walls that divide us")).toDF("id", "saying") df.registerTempTable("tmp_table") - checkAnswer(ctx.sql("select spark_partition_id() from tmp_table").toDF(), Row(0)) + checkAnswer(sql("select spark_partition_id() from tmp_table").toDF(), Row(0)) ctx.dropTempTable("tmp_table") } @@ -66,9 +63,9 @@ class UDFSuite extends QueryTest with SQLTestUtils { val data = ctx.sparkContext.parallelize(0 to 10, 2).toDF("id") data.write.parquet(dir.getCanonicalPath) ctx.read.parquet(dir.getCanonicalPath).registerTempTable("test_table") - val answer = ctx.sql("select input_file_name() from test_table").head().getString(0) + val answer = sql("select input_file_name() from test_table").head().getString(0) assert(answer.contains(dir.getCanonicalPath)) - assert(ctx.sql("select input_file_name() from test_table").distinct().collect().length >= 2) + assert(sql("select input_file_name() from test_table").distinct().collect().length >= 2) ctx.dropTempTable("test_table") } } @@ -91,17 +88,17 @@ class UDFSuite extends QueryTest with SQLTestUtils { test("Simple UDF") { ctx.udf.register("strLenScala", (_: String).length) - assert(ctx.sql("SELECT strLenScala('test')").head().getInt(0) === 4) + assert(sql("SELECT strLenScala('test')").head().getInt(0) === 4) } test("ZeroArgument UDF") { ctx.udf.register("random0", () => { Math.random()}) - assert(ctx.sql("SELECT random0()").head().getDouble(0) >= 0.0) + assert(sql("SELECT random0()").head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { ctx.udf.register("strLenScala", (_: String).length + (_: Int)) - assert(ctx.sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5) + assert(sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5) } test("UDF in a WHERE") { @@ -112,7 +109,7 @@ class UDFSuite extends QueryTest with SQLTestUtils { df.registerTempTable("integerData") val result = - ctx.sql("SELECT * FROM integerData WHERE oneArgFilter(key)") + sql("SELECT * FROM integerData WHERE oneArgFilter(key)") assert(result.count() === 20) } @@ -124,7 +121,7 @@ class UDFSuite extends QueryTest with SQLTestUtils { df.registerTempTable("groupData") val result = - ctx.sql( + sql( """ | SELECT g, SUM(v) as s | FROM groupData @@ -143,7 +140,7 @@ class 
UDFSuite extends QueryTest with SQLTestUtils { df.registerTempTable("groupData") val result = - ctx.sql( + sql( """ | SELECT SUM(v) | FROM groupData @@ -163,7 +160,7 @@ class UDFSuite extends QueryTest with SQLTestUtils { df.registerTempTable("groupData") val result = - ctx.sql( + sql( """ | SELECT timesHundred(SUM(v)) as v100 | FROM groupData @@ -178,7 +175,7 @@ class UDFSuite extends QueryTest with SQLTestUtils { ctx.udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2)) val result = - ctx.sql("SELECT returnStruct('test', 'test2') as ret") + sql("SELECT returnStruct('test', 'test2') as ret") .select($"ret.f1").head().getString(0) assert(result === "test") } @@ -186,12 +183,12 @@ class UDFSuite extends QueryTest with SQLTestUtils { test("udf that is transformed") { ctx.udf.register("makeStruct", (x: Int, y: Int) => (x, y)) // 1 + 1 is constant folded causing a transformation. - assert(ctx.sql("SELECT makeStruct(1 + 1, 2)").first().getAs[Row](0) === Row(2, 2)) + assert(sql("SELECT makeStruct(1 + 1, 2)").first().getAs[Row](0) === Row(2, 2)) } test("type coercion for udf inputs") { ctx.udf.register("intExpected", (x: Int) => x) // pass a decimal to intExpected. - assert(ctx.sql("SELECT intExpected(1.0)").head().getInt(0) === 1) + assert(sql("SELECT intExpected(1.0)").head().getInt(0) === 1) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index c5faaa663e74..219435dff5bc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -23,11 +23,21 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.PlatformDependent +import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.memory.MemoryAllocator import org.apache.spark.unsafe.types.UTF8String class UnsafeRowSuite extends SparkFunSuite { + + test("bitset width calculation") { + assert(UnsafeRow.calculateBitSetWidthInBytes(0) === 0) + assert(UnsafeRow.calculateBitSetWidthInBytes(1) === 8) + assert(UnsafeRow.calculateBitSetWidthInBytes(32) === 8) + assert(UnsafeRow.calculateBitSetWidthInBytes(64) === 8) + assert(UnsafeRow.calculateBitSetWidthInBytes(65) === 16) + assert(UnsafeRow.calculateBitSetWidthInBytes(128) === 16) + } + test("writeToStream") { val row = InternalRow.apply(UTF8String.fromString("hello"), UTF8String.fromString("world"), 123) val arrayBackedUnsafeRow: UnsafeRow = @@ -41,7 +51,7 @@ class UnsafeRowSuite extends SparkFunSuite { val bytesFromOffheapRow: Array[Byte] = { val offheapRowPage = MemoryAllocator.UNSAFE.allocate(arrayBackedUnsafeRow.getSizeInBytes) try { - PlatformDependent.copyMemory( + Platform.copyMemory( arrayBackedUnsafeRow.getBaseObject, arrayBackedUnsafeRow.getBaseOffset, offheapRowPage.getBaseObject, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index f29935224e5b..b6d279ae4726 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -24,6 +24,7 @@ import com.clearspring.analytics.stream.cardinality.HyperLogLog import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.catalyst.expressions.{OpenHashSetUDT, HyperLogLogUDT} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.collection.OpenHashSet @@ -66,10 +67,8 @@ private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] { private[spark] override def asNullable: MyDenseVectorUDT = this } -class UserDefinedTypeSuite extends QueryTest { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { + import testImplicits._ private lazy val pointsRDD = Seq( MyLabeledPoint(1.0, new MyDenseVector(Array(0.1, 1.0))), @@ -94,7 +93,7 @@ class UserDefinedTypeSuite extends QueryTest { ctx.udf.register("testType", (d: MyDenseVector) => d.isInstanceOf[MyDenseVector]) pointsRDD.registerTempTable("points") checkAnswer( - ctx.sql("SELECT testType(features) from points"), + sql("SELECT testType(features) from points"), Seq(Row(true), Row(true))) } @@ -138,4 +137,24 @@ class UserDefinedTypeSuite extends QueryTest { val actual = openHashSetUDT.deserialize(openHashSetUDT.serialize(set)) assert(actual.iterator.toSet === set.iterator.toSet) } + + test("UDTs with JSON") { + val data = Seq( + "{\"id\":1,\"vec\":[1.1,2.2,3.3,4.4]}", + "{\"id\":2,\"vec\":[2.25,4.5,8.75]}" + ) + val schema = StructType(Seq( + StructField("id", IntegerType, false), + StructField("vec", new MyDenseVectorUDT, false) + )) + + val stringRDD = ctx.sparkContext.parallelize(data) + val jsonRDD = ctx.read.schema(schema).json(stringRDD) + checkAnswer( + jsonRDD, + Row(1, new MyDenseVector(Array(1.1, 2.2, 3.3, 4.4))) :: + Row(2, new MyDenseVector(Array(2.25, 4.5, 8.75))) :: + Nil + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala index 66014ddca059..d0430d2a60e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala @@ -19,33 +19,36 @@ package org.apache.spark.sql.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types._ class ColumnStatsSuite extends SparkFunSuite { - testColumnStats(classOf[BooleanColumnStats], BOOLEAN, InternalRow(true, false, 0)) - testColumnStats(classOf[ByteColumnStats], BYTE, InternalRow(Byte.MaxValue, Byte.MinValue, 0)) - testColumnStats(classOf[ShortColumnStats], SHORT, InternalRow(Short.MaxValue, Short.MinValue, 0)) - testColumnStats(classOf[IntColumnStats], INT, InternalRow(Int.MaxValue, Int.MinValue, 0)) - testColumnStats(classOf[DateColumnStats], DATE, InternalRow(Int.MaxValue, Int.MinValue, 0)) - testColumnStats(classOf[LongColumnStats], LONG, InternalRow(Long.MaxValue, Long.MinValue, 0)) + testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0)) + testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0)) + testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0)) + testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0)) + testColumnStats(classOf[DateColumnStats], DATE, createRow(Int.MaxValue, Int.MinValue, 0)) + 
testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0)) testColumnStats(classOf[TimestampColumnStats], TIMESTAMP, - InternalRow(Long.MaxValue, Long.MinValue, 0)) - testColumnStats(classOf[FloatColumnStats], FLOAT, InternalRow(Float.MaxValue, Float.MinValue, 0)) + createRow(Long.MaxValue, Long.MinValue, 0)) + testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0)) testColumnStats(classOf[DoubleColumnStats], DOUBLE, - InternalRow(Double.MaxValue, Double.MinValue, 0)) - testColumnStats(classOf[StringColumnStats], STRING, InternalRow(null, null, 0)) - testDecimalColumnStats(InternalRow(null, null, 0)) + createRow(Double.MaxValue, Double.MinValue, 0)) + testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0)) + testDecimalColumnStats(createRow(null, null, 0)) + + def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray) def testColumnStats[T <: AtomicType, U <: ColumnStats]( columnStatsClass: Class[U], columnType: NativeColumnType[T], - initialStatistics: InternalRow): Unit = { + initialStatistics: GenericInternalRow): Unit = { val columnStatsName = columnStatsClass.getSimpleName test(s"$columnStatsName: empty") { val columnStats = columnStatsClass.newInstance() - columnStats.collectedStatistics.toSeq.zip(initialStatistics.toSeq).foreach { + columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } @@ -61,11 +64,11 @@ class ColumnStatsSuite extends SparkFunSuite { val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics - assertResult(values.min(ordering), "Wrong lower bound")(stats.genericGet(0)) - assertResult(values.max(ordering), "Wrong upper bound")(stats.genericGet(1)) - assertResult(10, "Wrong null count")(stats.genericGet(2)) - assertResult(20, "Wrong row count")(stats.genericGet(3)) - assertResult(stats.genericGet(4), "Wrong size in bytes") { + assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) + assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) + assertResult(10, "Wrong null count")(stats.values(2)) + assertResult(20, "Wrong row count")(stats.values(3)) + assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum @@ -73,14 +76,15 @@ class ColumnStatsSuite extends SparkFunSuite { } } - def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats](initialStatistics: InternalRow) { + def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats]( + initialStatistics: GenericInternalRow): Unit = { val columnStatsName = classOf[FixedDecimalColumnStats].getSimpleName val columnType = FIXED_DECIMAL(15, 10) test(s"$columnStatsName: empty") { val columnStats = new FixedDecimalColumnStats(15, 10) - columnStats.collectedStatistics.toSeq.zip(initialStatistics.toSeq).foreach { + columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } @@ -96,11 +100,11 @@ class ColumnStatsSuite extends SparkFunSuite { val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics - assertResult(values.min(ordering), "Wrong lower bound")(stats.genericGet(0)) - assertResult(values.max(ordering), "Wrong upper bound")(stats.genericGet(1)) - assertResult(10, "Wrong null 
count")(stats.genericGet(2)) - assertResult(20, "Wrong row count")(stats.genericGet(3)) - assertResult(stats.genericGet(4), "Wrong size in bytes") { + assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) + assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) + assertResult(10, "Wrong null count")(stats.values(2)) + assertResult(20, "Wrong row count")(stats.values(3)) + assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 037e2048a863..83db9b6510b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -19,18 +19,16 @@ package org.apache.spark.sql.columnar import java.sql.{Date, Timestamp} -import org.apache.spark.sql.TestData._ +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{QueryTest, Row, TestData} import org.apache.spark.storage.StorageLevel.MEMORY_ONLY -class InMemoryColumnarQuerySuite extends QueryTest { - // Make sure the tables are loaded. - TestData +class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { + import testImplicits._ - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - import ctx.{logicalPlanToSparkQuery, sql} + setupTestData() test("simple columnar query") { val plan = ctx.executePlan(testData.logicalPlan).executedPlan @@ -148,7 +146,7 @@ class InMemoryColumnarQuerySuite extends QueryTest { val dataTypes = Seq(StringType, BinaryType, NullType, BooleanType, ByteType, ShortType, IntegerType, LongType, - FloatType, DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType(6, 5), + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), DateType, TimestampType, ArrayType(IntegerType), MapType(StringType, LongType), struct) val fields = dataTypes.zipWithIndex.map { case (dataType, index) => @@ -193,4 +191,24 @@ class InMemoryColumnarQuerySuite extends QueryTest { ctx.table("InMemoryCache_different_data_types").collect()) ctx.dropTempTable("InMemoryCache_different_data_types") } + + test("SPARK-10422: String column in InMemoryColumnarCache needs to override clone method") { + val df = + ctx.range(1, 100).selectExpr("id % 10 as id").rdd.map(id => Tuple1(s"str_$id")).toDF("i") + val cached = df.cache() + // count triggers the caching action. It should not throw. + cached.count() + + // Make sure, the DataFrame is indeed cached. + assert(sqlContext.cacheManager.lookupCachedData(cached).nonEmpty) + + // Check result. + checkAnswer( + cached, + ctx.range(1, 100).selectExpr("id % 10 as id").rdd.map(id => Tuple1(s"str_$id")).toDF("i") + ) + + // Drop the cache. 
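The change running through all of these suites is the move from the global TestSQLContext singleton to the SharedSQLContext trait. A minimal sketch of the suite shape being migrated to; the suite name and data here are illustrative and not part of the patch:

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.SharedSQLContext

class ExampleQuerySuite extends QueryTest with SharedSQLContext {
  import testImplicits._  // implicits now come from the shared context, not TestSQLContext

  test("round trip through a temp table") {
    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    df.registerTempTable("example")
    // `sql` and `ctx` are supplied by SharedSQLContext rather than a global singleton.
    checkAnswer(sql("SELECT id FROM example WHERE value = 'b'"), Row(2) :: Nil)
    ctx.dropTempTable("example")
  }
}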
+ cached.unpersist() + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala index 2c0879927a12..ab2644eb4581 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala @@ -17,20 +17,19 @@ package org.apache.spark.sql.columnar -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} - import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.SQLTestData._ -class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter { - - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ +class PartitionBatchPruningSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ private lazy val originalColumnBatchSize = ctx.conf.columnBatchSize private lazy val originalInMemoryPartitionPruning = ctx.conf.inMemoryPartitionPruning override protected def beforeAll(): Unit = { + super.beforeAll() // Make a table with 5 partitions, 2 batches per partition, 10 elements per batch ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, 10) @@ -44,19 +43,17 @@ class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll wi ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) // Enable in-memory table scan accumulators ctx.setConf("spark.sql.inMemoryTableScanStatistics.enable", "true") - } - - override protected def afterAll(): Unit = { - ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) - ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) - } - - before { ctx.cacheTable("pruningData") } - after { - ctx.uncacheTable("pruningData") + override protected def afterAll(): Unit = { + try { + ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) + ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) + ctx.uncacheTable("pruningData") + } finally { + super.afterAll() + } } // Comparisons @@ -110,7 +107,7 @@ class PartitionBatchPruningSuite extends SparkFunSuite with BeforeAndAfterAll wi expectedQueryResult: => Seq[Int]): Unit = { test(query) { - val df = ctx.sql(query) + val df = sql(query) val queryExecution = df.queryExecution assertResult(expectedQueryResult.toArray, s"Wrong query result: $queryExecution") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala deleted file mode 100644 index 20def6bef0c1..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.test.TestSQLContext - -class AggregateSuite extends SparkPlanTest { - - test("SPARK-8357 unsafe aggregation path should not leak memory with empty input") { - val codegenDefault = TestSQLContext.getConf(SQLConf.CODEGEN_ENABLED) - val unsafeDefault = TestSQLContext.getConf(SQLConf.UNSAFE_ENABLED) - try { - TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, true) - TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, true) - val df = Seq.empty[(Int, Int)].toDF("a", "b") - checkAnswer( - df, - GeneratedAggregate( - partial = true, - Seq(df.col("b").expr), - Seq(Alias(Count(df.col("a").expr), "cnt")()), - unsafeEnabled = true, - _: SparkPlan), - Seq.empty - ) - } finally { - TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault) - TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, unsafeDefault) - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala index 79e903c2bbd4..8998f5111124 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.plans.physical.SinglePartition +import org.apache.spark.sql.test.SharedSQLContext -class ExchangeSuite extends SparkPlanTest { +class ExchangeSuite extends SparkPlanTest with SharedSQLContext { test("shuffling UnsafeRows in exchange") { val input = (1 to 1000).map(Tuple1.apply) checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 18b0e54dc7c5..fad93b014c23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -18,24 +18,27 @@ package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.TestData._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{execution, Row, SQLConf} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.test.{SQLTestUtils, TestSQLContext} -import org.apache.spark.sql.test.TestSQLContext._ -import org.apache.spark.sql.test.TestSQLContext.implicits._ -import org.apache.spark.sql.test.TestSQLContext.planner._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -import 
org.apache.spark.sql.{SQLContext, Row, SQLConf, execution} -class PlannerSuite extends SparkFunSuite with SQLTestUtils { +class PlannerSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ - override def sqlContext: SQLContext = TestSQLContext + setupTestData() private def testPartialAggregationPlan(query: LogicalPlan): Unit = { + val _ctx = ctx + import _ctx.planner._ val plannedOption = HashAggregation(query).headOption.orElse(Aggregation(query).headOption) val planned = plannedOption.getOrElse( @@ -50,6 +53,8 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { } test("unions are collapsed") { + val _ctx = ctx + import _ctx.planner._ val query = testData.unionAll(testData).unionAll(testData).logicalPlan val planned = BasicOperators(query).head val logicalUnions = query collect { case u: logical.Union => u } @@ -77,14 +82,14 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { test("sizeInBytes estimation of limit operator for broadcast hash join optimization") { def checkPlan(fieldTypes: Seq[DataType], newThreshold: Int): Unit = { - setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, newThreshold) + ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, newThreshold) val fields = fieldTypes.zipWithIndex.map { case (dataType, index) => StructField(s"c${index}", dataType, true) } :+ StructField("key", IntegerType, true) val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) - val rowRDD = org.apache.spark.sql.test.TestSQLContext.sparkContext.parallelize(row :: Nil) - createDataFrame(rowRDD, schema).registerTempTable("testLimit") + val rowRDD = ctx.sparkContext.parallelize(row :: Nil) + ctx.createDataFrame(rowRDD, schema).registerTempTable("testLimit") val planned = sql( """ @@ -98,10 +103,10 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") - dropTempTable("testLimit") + ctx.dropTempTable("testLimit") } - val origThreshold = conf.autoBroadcastJoinThreshold + val origThreshold = ctx.conf.autoBroadcastJoinThreshold val simpleTypes = NullType :: @@ -133,18 +138,18 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { checkPlan(complexTypes, newThreshold = 901617) - setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) + ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) } test("InMemoryRelation statistics propagation") { - val origThreshold = conf.autoBroadcastJoinThreshold - setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, 81920) + val origThreshold = ctx.conf.autoBroadcastJoinThreshold + ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, 81920) testData.limit(3).registerTempTable("tiny") sql("CACHE TABLE tiny") val a = testData.as("a") - val b = table("tiny").as("b") + val b = ctx.table("tiny").as("b") val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } @@ -153,13 +158,27 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") - setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) + ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) } test("efficient limit -> project -> sort") { - val query = testData.sort('key).select('value).limit(2).logicalPlan - val planned = 
planner.TakeOrderedAndProject(query) - assert(planned.head.isInstanceOf[execution.TakeOrderedAndProject]) + { + val query = + testData.select('key, 'value).sort('key).limit(2).logicalPlan + val planned = ctx.planner.TakeOrderedAndProject(query) + assert(planned.head.isInstanceOf[execution.TakeOrderedAndProject]) + assert(planned.head.output === testData.select('key, 'value).logicalPlan.output) + } + + { + // We need to make sure TakeOrderedAndProject's output is correct when we push a project + // into it. + val query = + testData.select('key, 'value).sort('key).select('value, 'key).limit(2).logicalPlan + val planned = ctx.planner.TakeOrderedAndProject(query) + assert(planned.head.isInstanceOf[execution.TakeOrderedAndProject]) + assert(planned.head.output === testData.select('value, 'key).logicalPlan.output) + } } test("PartitioningCollection") { @@ -202,4 +221,151 @@ class PlannerSuite extends SparkFunSuite with SQLTestUtils { } } } + + // --- Unit tests of EnsureRequirements --------------------------------------------------------- + + // When it comes to testing whether EnsureRequirements properly ensures distribution requirements, + // there two dimensions that need to be considered: are the child partitionings compatible and + // do they satisfy the distribution requirements? As a result, we need at least four test cases. + + private def assertDistributionRequirementsAreSatisfied(outputPlan: SparkPlan): Unit = { + if (outputPlan.children.length > 1 + && outputPlan.requiredChildDistribution.toSet != Set(UnspecifiedDistribution)) { + val childPartitionings = outputPlan.children.map(_.outputPartitioning) + if (!Partitioning.allCompatible(childPartitionings)) { + fail(s"Partitionings are not compatible: $childPartitionings") + } + } + outputPlan.children.zip(outputPlan.requiredChildDistribution).foreach { + case (child, requiredDist) => + assert(child.outputPartitioning.satisfies(requiredDist), + s"$child output partitioning does not satisfy $requiredDist:\n$outputPlan") + } + } + + test("EnsureRequirements with incompatible child partitionings which satisfy distribution") { + // Consider an operator that requires inputs that are clustered by two expressions (e.g. 
+ // sort merge join where there are multiple columns in the equi-join condition) + val clusteringA = Literal(1) :: Nil + val clusteringB = Literal(2) :: Nil + val distribution = ClusteredDistribution(clusteringA ++ clusteringB) + // Say that the left and right inputs are each partitioned by _one_ of the two join columns: + val leftPartitioning = HashPartitioning(clusteringA, 1) + val rightPartitioning = HashPartitioning(clusteringB, 1) + // Individually, each input's partitioning satisfies the clustering distribution: + assert(leftPartitioning.satisfies(distribution)) + assert(rightPartitioning.satisfies(distribution)) + // However, these partitionings are not compatible with each other, so we still need to + // repartition both inputs prior to performing the join: + assert(!leftPartitioning.compatibleWith(rightPartitioning)) + assert(!rightPartitioning.compatibleWith(leftPartitioning)) + val inputPlan = DummySparkPlan( + children = Seq( + DummySparkPlan(outputPartitioning = leftPartitioning), + DummySparkPlan(outputPartitioning = rightPartitioning) + ), + requiredChildDistribution = Seq(distribution, distribution), + requiredChildOrdering = Seq(Seq.empty, Seq.empty) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case Exchange(_, _) => true }.isEmpty) { + fail(s"Exchange should have been added:\n$outputPlan") + } + } + + test("EnsureRequirements with child partitionings with different numbers of output partitions") { + // This is similar to the previous test, except it checks that partitionings are not compatible + // unless they produce the same number of partitions. + val clustering = Literal(1) :: Nil + val distribution = ClusteredDistribution(clustering) + val inputPlan = DummySparkPlan( + children = Seq( + DummySparkPlan(outputPartitioning = HashPartitioning(clustering, 1)), + DummySparkPlan(outputPartitioning = HashPartitioning(clustering, 2)) + ), + requiredChildDistribution = Seq(distribution, distribution), + requiredChildOrdering = Seq(Seq.empty, Seq.empty) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + } + + test("EnsureRequirements with compatible child partitionings that do not satisfy distribution") { + val distribution = ClusteredDistribution(Literal(1) :: Nil) + // The left and right inputs have compatible partitionings but they do not satisfy the + // distribution because they are clustered on different columns. Thus, we need to shuffle. + val childPartitioning = HashPartitioning(Literal(2) :: Nil, 1) + assert(!childPartitioning.satisfies(distribution)) + val inputPlan = DummySparkPlan( + children = Seq( + DummySparkPlan(outputPartitioning = childPartitioning), + DummySparkPlan(outputPartitioning = childPartitioning) + ), + requiredChildDistribution = Seq(distribution, distribution), + requiredChildOrdering = Seq(Seq.empty, Seq.empty) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case Exchange(_, _) => true }.isEmpty) { + fail(s"Exchange should have been added:\n$outputPlan") + } + } + + test("EnsureRequirements with compatible child partitionings that satisfy distribution") { + // In this case, all requirements are satisfied and no exchange should be added. 
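These EnsureRequirements tests separate two independent questions about the children's partitionings. A condensed restatement of the assertions from the first test case above, using the same Catalyst classes and illustrative literals:

// Dimension 1: does each child's partitioning satisfy the required distribution?
val keyA = Literal(1) :: Nil
val keyB = Literal(2) :: Nil
val required = ClusteredDistribution(keyA ++ keyB)
HashPartitioning(keyA, 1).satisfies(required)  // true: clustered on a subset of the required keys
HashPartitioning(keyB, 1).satisfies(required)  // true
// Dimension 2: are the children partitioned compatibly with each other?
HashPartitioning(keyA, 1).compatibleWith(HashPartitioning(keyB, 1))  // false, so a shuffle is still needed

Only when both answers are positive can the planner skip adding an Exchange, which is what the four test cases around this point enumerate.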
+ val distribution = ClusteredDistribution(Literal(1) :: Nil) + val childPartitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(childPartitioning.satisfies(distribution)) + val inputPlan = DummySparkPlan( + children = Seq( + DummySparkPlan(outputPartitioning = childPartitioning), + DummySparkPlan(outputPartitioning = childPartitioning) + ), + requiredChildDistribution = Seq(distribution, distribution), + requiredChildOrdering = Seq(Seq.empty, Seq.empty) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case Exchange(_, _) => true }.nonEmpty) { + fail(s"Exchange should not have been added:\n$outputPlan") + } + } + + // This is a regression test for SPARK-9703 + test("EnsureRequirements should not repartition if only ordering requirement is unsatisfied") { + // Consider an operator that imposes both output distribution and ordering requirements on its + // children, such as sort sort merge join. If the distribution requirements are satisfied but + // the output ordering requirements are unsatisfied, then the planner should only add sorts and + // should not need to add additional shuffles / exchanges. + val outputOrdering = Seq(SortOrder(Literal(1), Ascending)) + val distribution = ClusteredDistribution(Literal(1) :: Nil) + val inputPlan = DummySparkPlan( + children = Seq( + DummySparkPlan(outputPartitioning = SinglePartition), + DummySparkPlan(outputPartitioning = SinglePartition) + ), + requiredChildDistribution = Seq(distribution, distribution), + requiredChildOrdering = Seq(outputOrdering, outputOrdering) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case Exchange(_, _) => true }.nonEmpty) { + fail(s"No Exchanges should have been added:\n$outputPlan") + } + } + + // --------------------------------------------------------------------------------------------- +} + +// Used for unit-testing EnsureRequirements +private case class DummySparkPlan( + override val children: Seq[SparkPlan] = Nil, + override val outputOrdering: Seq[SortOrder] = Nil, + override val outputPartitioning: Partitioning = UnknownPartitioning(0), + override val requiredChildDistribution: Seq[Distribution] = Nil, + override val requiredChildOrdering: Seq[Seq[SortOrder]] = Nil + ) extends SparkPlan { + override protected def doExecute(): RDD[InternalRow] = throw new NotImplementedError + override def output: Seq[Attribute] = Seq.empty } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala index 707cd9c6d939..ef6ad59b71fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala @@ -17,38 +17,42 @@ package org.apache.spark.sql.execution +import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.expressions.{Literal, IsNull} -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute, Literal, IsNull} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{GenericArrayData, ArrayType, StringType} +import org.apache.spark.unsafe.types.UTF8String -class 
RowFormatConvertersSuite extends SparkPlanTest { +class RowFormatConvertersSuite extends SparkPlanTest with SharedSQLContext { private def getConverters(plan: SparkPlan): Seq[SparkPlan] = plan.collect { case c: ConvertToUnsafe => c case c: ConvertToSafe => c } - private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null)) + private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) assert(!outputsSafe.outputsUnsafeRows) - private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null)) + private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) assert(outputsUnsafe.outputsUnsafeRows) test("planner should insert unsafe->safe conversions when required") { val plan = Limit(10, outputsUnsafe) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(preparedPlan.children.head.isInstanceOf[ConvertToSafe]) } test("filter can process unsafe rows") { val plan = Filter(IsNull(IsNull(Literal(1))), outputsUnsafe) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(getConverters(preparedPlan).size === 1) assert(preparedPlan.outputsUnsafeRows) } test("filter can process safe rows") { val plan = Filter(IsNull(IsNull(Literal(1))), outputsSafe) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(getConverters(preparedPlan).isEmpty) assert(!preparedPlan.outputsUnsafeRows) } @@ -63,28 +67,62 @@ class RowFormatConvertersSuite extends SparkPlanTest { test("union requires all of its input rows' formats to agree") { val plan = Union(Seq(outputsSafe, outputsUnsafe)) assert(plan.canProcessSafeRows && plan.canProcessUnsafeRows) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(preparedPlan.outputsUnsafeRows) } test("union can process safe rows") { val plan = Union(Seq(outputsSafe, outputsSafe)) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(!preparedPlan.outputsUnsafeRows) } test("union can process unsafe rows") { val plan = Union(Seq(outputsUnsafe, outputsUnsafe)) - val preparedPlan = TestSQLContext.prepareForExecution.execute(plan) + val preparedPlan = ctx.prepareForExecution.execute(plan) assert(preparedPlan.outputsUnsafeRows) } test("round trip with ConvertToUnsafe and ConvertToSafe") { val input = Seq(("hello", 1), ("world", 2)) checkAnswer( - TestSQLContext.createDataFrame(input), + ctx.createDataFrame(input), plan => ConvertToSafe(ConvertToUnsafe(plan)), input.map(Row.fromTuple) ) } + + test("SPARK-9683: copy UTF8String when convert unsafe array/map to safe") { + SparkPlan.currentContext.set(ctx) + val schema = ArrayType(StringType) + val rows = (1 to 100).map { i => + InternalRow(new GenericArrayData(Array[Any](UTF8String.fromString(i.toString)))) + } + val relation = LocalTableScan(Seq(AttributeReference("t", schema)()), rows) + + val plan = + DummyPlan( + ConvertToSafe( + ConvertToUnsafe(relation))) + assert(plan.execute().collect().map(_.getUTF8String(0).toString) === (1 to 100).map(_.toString)) + } +} + +case class DummyPlan(child: SparkPlan) extends UnaryNode { + + override protected def doExecute(): RDD[InternalRow] = { + child.execute().mapPartitions { iter => + // This `DummyPlan` is in 
safe mode, so we don't need to do copy even we hold some + // values gotten from the incoming rows. + // we cache all strings here to make sure we have deep copied UTF8String inside incoming + // safe InternalRow. + val strings = new scala.collection.mutable.ArrayBuffer[UTF8String] + iter.foreach { row => + strings += row.getArray(0).getUTF8String(0) + } + strings.map(InternalRow(_)).iterator + } + } + + override def output: Seq[Attribute] = Seq(AttributeReference("a", StringType)()) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala index a2c10fdaf6cd..8fa77b0fcb7b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.test.SharedSQLContext -class SortSuite extends SparkPlanTest { +class SortSuite extends SparkPlanTest with SharedSQLContext { // This test was originally added as an example of how to use [[SparkPlanTest]]; // it's not designed to be a comprehensive test of ExternalSort. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index f46855edfe0d..3a87f374d94b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -17,29 +17,27 @@ package org.apache.spark.sql.execution -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.test.TestSQLContext -import org.apache.spark.sql.{SQLContext, DataFrame, DataFrameHolder, Row} - import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row, SQLContext} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.util._ + /** * Base class for writing tests for individual physical operators. For an example of how this * class's test helper methods can be used, see [[SortSuite]]. */ -class SparkPlanTest extends SparkFunSuite { - - protected def sqlContext: SQLContext = TestSQLContext +private[sql] abstract class SparkPlanTest extends SparkFunSuite { + protected def _sqlContext: SQLContext /** * Creates a DataFrame from a local Seq of Product. 
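With SparkPlanTest now abstract over _sqlContext, a physical-operator suite simply mixes in SharedSQLContext, which supplies that context. A minimal hypothetical example of the resulting shape; the suite name and data are illustrative:

import org.apache.spark.sql.Row
import org.apache.spark.sql.test.SharedSQLContext

class ExamplePlanSuite extends SparkPlanTest with SharedSQLContext {
  test("identity plan transformation preserves the input rows") {
    val input = Seq(("hello", 1), ("world", 2))
    checkAnswer(
      input.toDF("a", "b"),         // via the implicit Seq-to-DataFrame conversion below
      (child: SparkPlan) => child,  // trivial plan function: execute the input plan as-is
      input.map(Row.fromTuple))
  }
}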
*/ implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = { - sqlContext.implicits.localSeqToDataFrameHolder(data) + _sqlContext.implicits.localSeqToDataFrameHolder(data) } /** @@ -100,7 +98,7 @@ class SparkPlanTest extends SparkFunSuite { planFunction: Seq[SparkPlan] => SparkPlan, expectedAnswer: Seq[Row], sortAnswers: Boolean = true): Unit = { - SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers, sqlContext) match { + SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers, _sqlContext) match { case Some(errorMessage) => fail(errorMessage) case None => } @@ -124,7 +122,7 @@ class SparkPlanTest extends SparkFunSuite { expectedPlanFunction: SparkPlan => SparkPlan, sortAnswers: Boolean = true): Unit = { SparkPlanTest.checkAnswer( - input, planFunction, expectedPlanFunction, sortAnswers, sqlContext) match { + input, planFunction, expectedPlanFunction, sortAnswers, _sqlContext) match { case Some(errorMessage) => fail(errorMessage) case None => } @@ -151,13 +149,13 @@ object SparkPlanTest { planFunction: SparkPlan => SparkPlan, expectedPlanFunction: SparkPlan => SparkPlan, sortAnswers: Boolean, - sqlContext: SQLContext): Option[String] = { + _sqlContext: SQLContext): Option[String] = { val outputPlan = planFunction(input.queryExecution.sparkPlan) val expectedOutputPlan = expectedPlanFunction(input.queryExecution.sparkPlan) val expectedAnswer: Seq[Row] = try { - executePlan(expectedOutputPlan, sqlContext) + executePlan(expectedOutputPlan, _sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -172,7 +170,7 @@ object SparkPlanTest { } val actualAnswer: Seq[Row] = try { - executePlan(outputPlan, sqlContext) + executePlan(outputPlan, _sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -212,12 +210,12 @@ object SparkPlanTest { planFunction: Seq[SparkPlan] => SparkPlan, expectedAnswer: Seq[Row], sortAnswers: Boolean, - sqlContext: SQLContext): Option[String] = { + _sqlContext: SQLContext): Option[String] = { val outputPlan = planFunction(input.map(_.queryExecution.sparkPlan)) val sparkAnswer: Seq[Row] = try { - executePlan(outputPlan, sqlContext) + executePlan(outputPlan, _sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -280,10 +278,10 @@ object SparkPlanTest { } } - private def executePlan(outputPlan: SparkPlan, sqlContext: SQLContext): Seq[Row] = { + private def executePlan(outputPlan: SparkPlan, _sqlContext: SQLContext): Seq[Row] = { // A very simple resolver to make writing tests easier. In contrast to the real resolver // this is always case sensitive and does not try to handle scoping or complex type resolution. - val resolvedPlan = sqlContext.prepareForExecution.execute( + val resolvedPlan = _sqlContext.prepareForExecution.execute( outputPlan transform { case plan: SparkPlan => val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala deleted file mode 100644 index 7978ed57a937..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import java.sql.{Timestamp, Date} - -import org.apache.spark.sql.test.TestSQLContext -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.rdd.ShuffledRDD -import org.apache.spark.serializer.Serializer -import org.apache.spark.{ShuffleDependency, SparkFunSuite} -import org.apache.spark.sql.types._ -import org.apache.spark.sql.Row -import org.apache.spark.sql.{MyDenseVectorUDT, QueryTest} - -class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite { - // Make sure that we will not use serializer2 for unsupported data types. - def checkSupported(dataType: DataType, isSupported: Boolean): Unit = { - val testName = - s"${if (dataType == null) null else dataType.toString} is " + - s"${if (isSupported) "supported" else "unsupported"}" - - test(testName) { - assert(SparkSqlSerializer2.support(Array(dataType)) === isSupported) - } - } - - checkSupported(null, isSupported = true) - checkSupported(BooleanType, isSupported = true) - checkSupported(ByteType, isSupported = true) - checkSupported(ShortType, isSupported = true) - checkSupported(IntegerType, isSupported = true) - checkSupported(LongType, isSupported = true) - checkSupported(FloatType, isSupported = true) - checkSupported(DoubleType, isSupported = true) - checkSupported(DateType, isSupported = true) - checkSupported(TimestampType, isSupported = true) - checkSupported(StringType, isSupported = true) - checkSupported(BinaryType, isSupported = true) - checkSupported(DecimalType(10, 5), isSupported = true) - checkSupported(DecimalType.SYSTEM_DEFAULT, isSupported = true) - - // If NullType is the only data type in the schema, we do not support it. - checkSupported(NullType, isSupported = false) - // For now, ArrayType, MapType, and StructType are not supported. - checkSupported(ArrayType(DoubleType, true), isSupported = false) - checkSupported(ArrayType(StringType, false), isSupported = false) - checkSupported(MapType(IntegerType, StringType, true), isSupported = false) - checkSupported(MapType(IntegerType, ArrayType(DoubleType), false), isSupported = false) - checkSupported(StructType(StructField("a", IntegerType, true) :: Nil), isSupported = false) - // UDTs are not supported right now. 
- checkSupported(new MyDenseVectorUDT, isSupported = false) -} - -abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll { - var allColumns: String = _ - val serializerClass: Class[Serializer] = - classOf[SparkSqlSerializer2].asInstanceOf[Class[Serializer]] - var numShufflePartitions: Int = _ - var useSerializer2: Boolean = _ - - protected lazy val ctx = TestSQLContext - - override def beforeAll(): Unit = { - numShufflePartitions = ctx.conf.numShufflePartitions - useSerializer2 = ctx.conf.useSqlSerializer2 - - ctx.sql("set spark.sql.useSerializer2=true") - - val supportedTypes = - Seq(StringType, BinaryType, NullType, BooleanType, - ByteType, ShortType, IntegerType, LongType, - FloatType, DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType(6, 5), - DateType, TimestampType) - - val fields = supportedTypes.zipWithIndex.map { case (dataType, index) => - StructField(s"col$index", dataType, true) - } - allColumns = fields.map(_.name).mkString(",") - val schema = StructType(fields) - - // Create a RDD with all data types supported by SparkSqlSerializer2. - val rdd = - ctx.sparkContext.parallelize((1 to 1000), 10).map { i => - Row( - s"str${i}: test serializer2.", - s"binary${i}: test serializer2.".getBytes("UTF-8"), - null, - i % 2 == 0, - i.toByte, - i.toShort, - i, - Long.MaxValue - i.toLong, - (i + 0.25).toFloat, - (i + 0.75), - BigDecimal(Long.MaxValue.toString + ".12345"), - new java.math.BigDecimal(s"${i % 9 + 1}" + ".23456"), - new Date(i), - new Timestamp(i)) - } - - ctx.createDataFrame(rdd, schema).registerTempTable("shuffle") - - super.beforeAll() - } - - override def afterAll(): Unit = { - ctx.dropTempTable("shuffle") - ctx.sql(s"set spark.sql.shuffle.partitions=$numShufflePartitions") - ctx.sql(s"set spark.sql.useSerializer2=$useSerializer2") - super.afterAll() - } - - def checkSerializer[T <: Serializer]( - executedPlan: SparkPlan, - expectedSerializerClass: Class[T]): Unit = { - executedPlan.foreach { - case exchange: Exchange => - val shuffledRDD = exchange.execute() - val dependency = shuffledRDD.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] - val serializerNotSetMessage = - s"Expected $expectedSerializerClass as the serializer of Exchange. " + - s"However, the serializer was not set." - val serializer = dependency.serializer.getOrElse(fail(serializerNotSetMessage)) - val isExpectedSerializer = - serializer.getClass == expectedSerializerClass || - serializer.getClass == classOf[UnsafeRowSerializer] - val wrongSerializerErrorMessage = - s"Expected ${expectedSerializerClass.getCanonicalName} or " + - s"${classOf[UnsafeRowSerializer].getCanonicalName}. But " + - s"${serializer.getClass.getCanonicalName} is used." - assert(isExpectedSerializer, wrongSerializerErrorMessage) - case _ => // Ignore other nodes. 
- } - } - - test("key schema and value schema are not nulls") { - val df = ctx.sql(s"SELECT DISTINCT ${allColumns} FROM shuffle") - checkSerializer(df.queryExecution.executedPlan, serializerClass) - checkAnswer( - df, - ctx.table("shuffle").collect()) - } - - test("key schema is null") { - val aggregations = allColumns.split(",").map(c => s"COUNT($c)").mkString(",") - val df = ctx.sql(s"SELECT $aggregations FROM shuffle") - checkSerializer(df.queryExecution.executedPlan, serializerClass) - checkAnswer( - df, - Row(1000, 1000, 0, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000)) - } - - test("value schema is null") { - val df = ctx.sql(s"SELECT col0 FROM shuffle ORDER BY col0") - checkSerializer(df.queryExecution.executedPlan, serializerClass) - assert(df.map(r => r.getString(0)).collect().toSeq === - ctx.table("shuffle").select("col0").map(r => r.getString(0)).collect().sorted.toSeq) - } - - test("no map output field") { - val df = ctx.sql(s"SELECT 1 + 1 FROM shuffle") - checkSerializer(df.queryExecution.executedPlan, classOf[SparkSqlSerializer]) - } - - test("types of fields are all NullTypes") { - // Test range partitioning code path. - val nulls = ctx.sql(s"SELECT null as a, null as b, null as c") - val df = nulls.unionAll(nulls).sort("a") - checkSerializer(df.queryExecution.executedPlan, classOf[SparkSqlSerializer]) - checkAnswer( - df, - Row(null, null, null) :: Row(null, null, null) :: Nil) - - // Test hash partitioning code path. - val oneRow = ctx.sql(s"SELECT DISTINCT null, null, null FROM shuffle") - checkSerializer(oneRow.queryExecution.executedPlan, classOf[SparkSqlSerializer]) - checkAnswer( - oneRow, - Row(null, null, null)) - } -} - -/** Tests SparkSqlSerializer2 with sort based shuffle without sort merge. */ -class SparkSqlSerializer2SortShuffleSuite extends SparkSqlSerializer2Suite { - override def beforeAll(): Unit = { - super.beforeAll() - // Sort merge will not be triggered. - val bypassMergeThreshold = - ctx.sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) - ctx.sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold-1}") - } -} - -/** For now, we will use SparkSqlSerializer for sort based shuffle with sort merge. */ -class SparkSqlSerializer2SortMergeShuffleSuite extends SparkSqlSerializer2Suite { - - override def beforeAll(): Unit = { - super.beforeAll() - // To trigger the sort merge. - val bypassMergeThreshold = - ctx.sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) - ctx.sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold + 1}") - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala index 53de2d0f0771..48c3938ff87b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala @@ -22,7 +22,7 @@ import org.apache.spark.shuffle.ShuffleMemoryManager /** * A [[ShuffleMemoryManager]] that can be controlled to run out of memory. 
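The suites above force spills by flipping this manager into a simulated out-of-memory state via markAsOutOfMemory(). A rough standalone sketch of that idea; the class name here is hypothetical, and the real TestShuffleMemoryManager extends ShuffleMemoryManager with the page size shown below:

class FakeOomMemoryManager {
  private var oom = false

  // Grant whatever is requested unless a fake OOM has been scheduled.
  def tryToAcquire(numBytes: Long): Long = {
    if (oom) {
      oom = false  // report "no memory" for this one request
      0L
    } else {
      numBytes
    }
  }

  // Tests call this right before an insert so the next allocation fails and a spill is exercised.
  def markAsOutOfMemory(): Unit = { oom = true }
}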
*/ -class TestShuffleMemoryManager extends ShuffleMemoryManager(Long.MaxValue) { +class TestShuffleMemoryManager extends ShuffleMemoryManager(Long.MaxValue, 4 * 1024 * 1024) { private var oom = false override def tryToAcquire(numBytes: Long): Long = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala index c7949848513c..3158458edb83 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala @@ -19,24 +19,28 @@ package org.apache.spark.sql.execution import scala.util.Random -import org.scalatest.BeforeAndAfterAll - +import org.apache.spark.AccumulatorSuite import org.apache.spark.sql.{RandomDataGenerator, Row, SQLConf} import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ /** * A test suite that generates randomized data to test the [[TungstenSort]] operator. */ -class TungstenSortSuite extends SparkPlanTest with BeforeAndAfterAll { +class TungstenSortSuite extends SparkPlanTest with SharedSQLContext { override def beforeAll(): Unit = { - TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, true) + super.beforeAll() + ctx.conf.setConf(SQLConf.CODEGEN_ENABLED, true) } override def afterAll(): Unit = { - TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, SQLConf.CODEGEN_ENABLED.defaultValue.get) + try { + ctx.conf.setConf(SQLConf.CODEGEN_ENABLED, SQLConf.CODEGEN_ENABLED.defaultValue.get) + } finally { + super.afterAll() + } } test("sort followed by limit") { @@ -59,6 +63,17 @@ class TungstenSortSuite extends SparkPlanTest with BeforeAndAfterAll { ) } + test("sorting updates peak execution memory") { + val sc = ctx.sparkContext + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "unsafe external sort") { + checkThatPlansAgree( + (1 to 100).map(v => Tuple1(v)).toDF("a"), + (child: SparkPlan) => TungstenSort('a.asc :: Nil, true, child), + (child: SparkPlan) => Sort('a.asc :: Nil, global = true, child), + sortAnswers = false) + } + } + // Test sorting on different data types for ( dataType <- DataTypeTestUtils.atomicTypes ++ Set(NullType); @@ -68,8 +83,8 @@ class TungstenSortSuite extends SparkPlanTest with BeforeAndAfterAll { ) { test(s"sorting on $dataType with nullable=$nullable, sortOrder=$sortOrder") { val inputData = Seq.fill(1000)(randomDataGenerator()) - val inputDf = TestSQLContext.createDataFrame( - TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))), + val inputDf = ctx.createDataFrame( + ctx.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))), StructType(StructField("a", dataType, nullable = true) :: Nil) ) assert(TungstenSort.supportsSchema(inputDf.schema)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 7c591f6143b9..d1f0b2b1fc52 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -23,10 +23,10 @@ import scala.util.{Try, Random} import org.scalatest.Matchers -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import 
org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.{TaskContextImpl, TaskContext, SparkFunSuite} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} import org.apache.spark.unsafe.types.UTF8String @@ -36,7 +36,10 @@ import org.apache.spark.unsafe.types.UTF8String * * Use [[testWithMemoryLeakDetection]] rather than [[test]] to construct test cases. */ -class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite with Matchers { +class UnsafeFixedWidthAggregationMapSuite + extends SparkFunSuite + with Matchers + with SharedSQLContext { import UnsafeFixedWidthAggregationMap._ @@ -69,7 +72,8 @@ class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite with Matchers { taskAttemptId = Random.nextInt(10000), attemptNumber = 0, taskMemoryManager = taskMemoryManager, - metricsSystem = null)) + metricsSystem = null, + internalAccumulators = Seq.empty)) try { f @@ -92,7 +96,7 @@ class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite with Matchers { testWithMemoryLeakDetection("supported schemas") { assert(supportsAggregationBufferSchema( StructType(StructField("x", DecimalType.USER_DEFAULT) :: Nil))) - assert(!supportsAggregationBufferSchema( + assert(supportsAggregationBufferSchema( StructType(StructField("x", DecimalType.SYSTEM_DEFAULT) :: Nil))) assert(!supportsAggregationBufferSchema(StructType(StructField("x", StringType) :: Nil))) assert( @@ -170,9 +174,6 @@ class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite with Matchers { } testWithMemoryLeakDetection("test external sorting") { - // Calling this make sure we have block manager and everything else setup. - TestSQLContext - // Memory consumption in the beginning of the task. val initialMemoryConsumption = shuffleMemoryManager.getMemoryConsumptionForThisTask() @@ -230,4 +231,106 @@ class UnsafeFixedWidthAggregationMapSuite extends SparkFunSuite with Matchers { map.free() } + + testWithMemoryLeakDetection("test external sorting with an empty map") { + + val map = new UnsafeFixedWidthAggregationMap( + emptyAggregationBuffer, + aggBufferSchema, + groupKeySchema, + taskMemoryManager, + shuffleMemoryManager, + 128, // initial capacity + PAGE_SIZE_BYTES, + false // disable perf metrics + ) + + // Convert the map into a sorter + val sorter = map.destructAndCreateExternalSorter() + + // Add more keys to the sorter and make sure the results come out sorted. + val additionalKeys = randomStrings(1024) + val keyConverter = UnsafeProjection.create(groupKeySchema) + val valueConverter = UnsafeProjection.create(aggBufferSchema) + + additionalKeys.zipWithIndex.foreach { case (str, i) => + val k = InternalRow(UTF8String.fromString(str)) + val v = InternalRow(str.length) + sorter.insertKV(keyConverter.apply(k), valueConverter.apply(v)) + + if ((i % 100) == 0) { + shuffleMemoryManager.markAsOutOfMemory() + sorter.closeCurrentPage() + } + } + + val out = new scala.collection.mutable.ArrayBuffer[String] + val iter = sorter.sortedIterator() + while (iter.next()) { + // At here, we also test if copy is correct. 
+ val key = iter.getKey.copy() + val value = iter.getValue.copy() + assert(key.getString(0).length === value.getInt(0)) + out += key.getString(0) + } + + assert(out === (additionalKeys).sorted) + + map.free() + } + + testWithMemoryLeakDetection("test external sorting with empty records") { + + // Memory consumption in the beginning of the task. + val initialMemoryConsumption = shuffleMemoryManager.getMemoryConsumptionForThisTask() + + val map = new UnsafeFixedWidthAggregationMap( + emptyAggregationBuffer, + StructType(Nil), + StructType(Nil), + taskMemoryManager, + shuffleMemoryManager, + 128, // initial capacity + PAGE_SIZE_BYTES, + false // disable perf metrics + ) + + (1 to 10).foreach { i => + val buf = map.getAggregationBuffer(UnsafeRow.createFromByteArray(0, 0)) + assert(buf != null) + } + + // Convert the map into a sorter. Right now, it contains one record. + val sorter = map.destructAndCreateExternalSorter() + + withClue(s"destructAndCreateExternalSorter should release memory used by the map") { + // 4096 * 16 is the initial size allocated for the pointer/prefix array in the in-mem sorter. + assert(shuffleMemoryManager.getMemoryConsumptionForThisTask() === + initialMemoryConsumption + 4096 * 16) + } + + // Add more keys to the sorter and make sure the results come out sorted. + (1 to 4096).foreach { i => + sorter.insertKV(UnsafeRow.createFromByteArray(0, 0), UnsafeRow.createFromByteArray(0, 0)) + + if ((i % 100) == 0) { + shuffleMemoryManager.markAsOutOfMemory() + sorter.closeCurrentPage() + } + } + + var count = 0 + val iter = sorter.sortedIterator() + while (iter.next()) { + // At here, we also test if copy is correct. + iter.getKey.copy() + iter.getValue.copy() + count += 1; + } + + // 1 record was from the map and 4096 records were explicitly inserted. + assert(count === 4097) + + map.free() + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 0282b25b9dd5..d3be568a8758 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -20,18 +20,17 @@ package org.apache.spark.sql.execution import scala.util.Random import org.apache.spark._ -import org.apache.spark.sql.RandomDataGenerator +import org.apache.spark.sql.{RandomDataGenerator, Row} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, RowOrdering, UnsafeProjection} -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.catalyst.expressions.{InterpretedOrdering, UnsafeRow, UnsafeProjection} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} /** * Test suite for [[UnsafeKVExternalSorter]], with randomly generated test data. 
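The KV-sorter tests below feed the sorter UnsafeRows built through UnsafeProjection, the same conversion chain used by the aggregation-map tests above. A minimal sketch of that step in isolation, relying on the imports already present in this file; the schema and values are illustrative:

// External Row -> Catalyst InternalRow -> UnsafeRow; copy() detaches the result from the
// projection's reused buffer before it is stored.
val schema = StructType(
  StructField("id", IntegerType, nullable = false) ::
  StructField("name", StringType, nullable = false) :: Nil)
val toCatalyst = CatalystTypeConverters.createToCatalystConverter(schema)
val toUnsafe = UnsafeProjection.create(schema)
val unsafeRow = toUnsafe(toCatalyst(Row(1, "a")).asInstanceOf[InternalRow]).copy()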
*/ -class UnsafeKVExternalSorterSuite extends SparkFunSuite { - +class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext { private val keyTypes = Seq(IntegerType, FloatType, DoubleType, StringType) private val valueTypes = Seq(IntegerType, FloatType, DoubleType, StringType) @@ -46,6 +45,7 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite { testKVSorter(keySchema, valueSchema, spill = i > 3) } + /** * Create a test case using randomly generated data for the given key and value schema. * @@ -60,95 +60,149 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite { * If spill is set to true, the sorter will spill probabilistically roughly every 100 records. */ private def testKVSorter(keySchema: StructType, valueSchema: StructType, spill: Boolean): Unit = { + // Create the data converters + val kExternalConverter = CatalystTypeConverters.createToCatalystConverter(keySchema) + val vExternalConverter = CatalystTypeConverters.createToCatalystConverter(valueSchema) + val kConverter = UnsafeProjection.create(keySchema) + val vConverter = UnsafeProjection.create(valueSchema) + + val keyDataGen = RandomDataGenerator.forType(keySchema, nullable = false).get + val valueDataGen = RandomDataGenerator.forType(valueSchema, nullable = false).get + + val inputData = Seq.fill(1024) { + val k = kConverter(kExternalConverter.apply(keyDataGen.apply()).asInstanceOf[InternalRow]) + val v = vConverter(vExternalConverter.apply(valueDataGen.apply()).asInstanceOf[InternalRow]) + (k.asInstanceOf[InternalRow].copy(), v.asInstanceOf[InternalRow].copy()) + } val keySchemaStr = keySchema.map(_.dataType.simpleString).mkString("[", ",", "]") val valueSchemaStr = valueSchema.map(_.dataType.simpleString).mkString("[", ",", "]") test(s"kv sorting key schema $keySchemaStr and value schema $valueSchemaStr") { - // Calling this make sure we have block manager and everything else setup. 
- TestSQLContext - - val taskMemMgr = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP)) - val shuffleMemMgr = new TestShuffleMemoryManager - TaskContext.setTaskContext(new TaskContextImpl( - stageId = 0, - partitionId = 0, - taskAttemptId = 98456, - attemptNumber = 0, - taskMemoryManager = taskMemMgr, - metricsSystem = null)) - - // Create the data converters - val kExternalConverter = CatalystTypeConverters.createToCatalystConverter(keySchema) - val vExternalConverter = CatalystTypeConverters.createToCatalystConverter(valueSchema) - val kConverter = UnsafeProjection.create(keySchema) - val vConverter = UnsafeProjection.create(valueSchema) - - val keyDataGen = RandomDataGenerator.forType(keySchema, nullable = false).get - val valueDataGen = RandomDataGenerator.forType(valueSchema, nullable = false).get - - val input = Seq.fill(1024) { - val k = kConverter(kExternalConverter.apply(keyDataGen.apply()).asInstanceOf[InternalRow]) - val v = vConverter(vExternalConverter.apply(valueDataGen.apply()).asInstanceOf[InternalRow]) - (k.asInstanceOf[InternalRow].copy(), v.asInstanceOf[InternalRow].copy()) - } - - val sorter = new UnsafeKVExternalSorter( - keySchema, valueSchema, SparkEnv.get.blockManager, shuffleMemMgr, 16 * 1024 * 1024) - - // Insert generated keys and values into the sorter - input.foreach { case (k, v) => - sorter.insertKV(k.asInstanceOf[UnsafeRow], v.asInstanceOf[UnsafeRow]) - // 1% chance we will spill - if (rand.nextDouble() < 0.01 && spill) { - shuffleMemMgr.markAsOutOfMemory() - sorter.closeCurrentPage() - } - } + testKVSorter( + keySchema, + valueSchema, + inputData, + pageSize = 16 * 1024 * 1024, + spill + ) + } + } - // Collect the sorted output - val out = new scala.collection.mutable.ArrayBuffer[(InternalRow, InternalRow)] - val iter = sorter.sortedIterator() - while (iter.next()) { - out += Tuple2(iter.getKey.copy(), iter.getValue.copy()) + /** + * Create a test case using the given input data for the given key and value schema. + * + * The approach works as follows: + * + * - Create input by randomly generating data based on the given schema + * - Run [[UnsafeKVExternalSorter]] on the input data + * - Collect the output from the sorter, and make sure the keys are sorted in ascending order + * - Sort the input by both key and value, and sort the sorter output also by both key and value. + * Compare the sorted input and sorted output together to make sure all the key/values match. + * + * If spill is set to true, the sorter will spill probabilistically roughly every 100 records. 
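The "roughly every 100 records" figure comes from a 1% random trigger inside the insert loop of this helper; in outline (rand, spill, sorter and shuffleMemMgr as defined in the helper itself):

inputData.foreach { case (k, v) =>
  sorter.insertKV(k.asInstanceOf[UnsafeRow], v.asInstanceOf[UnsafeRow])
  if (spill && rand.nextDouble() < 0.01) {  // about 1 in 100 records
    shuffleMemMgr.markAsOutOfMemory()       // simulate memory pressure
    sorter.closeCurrentPage()               // close the in-memory page so the spill path is exercised
  }
}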
+ */ + private def testKVSorter( + keySchema: StructType, + valueSchema: StructType, + inputData: Seq[(InternalRow, InternalRow)], + pageSize: Long, + spill: Boolean): Unit = { + + val taskMemMgr = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP)) + val shuffleMemMgr = new TestShuffleMemoryManager + TaskContext.setTaskContext(new TaskContextImpl( + stageId = 0, + partitionId = 0, + taskAttemptId = 98456, + attemptNumber = 0, + taskMemoryManager = taskMemMgr, + metricsSystem = null, + internalAccumulators = Seq.empty)) + + val sorter = new UnsafeKVExternalSorter( + keySchema, valueSchema, SparkEnv.get.blockManager, shuffleMemMgr, pageSize) + + // Insert the keys and values into the sorter + inputData.foreach { case (k, v) => + sorter.insertKV(k.asInstanceOf[UnsafeRow], v.asInstanceOf[UnsafeRow]) + // 1% chance we will spill + if (rand.nextDouble() < 0.01 && spill) { + shuffleMemMgr.markAsOutOfMemory() + sorter.closeCurrentPage() } + } - val keyOrdering = RowOrdering.forSchema(keySchema.map(_.dataType)) - val valueOrdering = RowOrdering.forSchema(valueSchema.map(_.dataType)) - val kvOrdering = new Ordering[(InternalRow, InternalRow)] { - override def compare(x: (InternalRow, InternalRow), y: (InternalRow, InternalRow)): Int = { - keyOrdering.compare(x._1, y._1) match { - case 0 => valueOrdering.compare(x._2, y._2) - case cmp => cmp - } + // Collect the sorted output + val out = new scala.collection.mutable.ArrayBuffer[(InternalRow, InternalRow)] + val iter = sorter.sortedIterator() + while (iter.next()) { + out += Tuple2(iter.getKey.copy(), iter.getValue.copy()) + } + sorter.cleanupResources() + + val keyOrdering = InterpretedOrdering.forSchema(keySchema.map(_.dataType)) + val valueOrdering = InterpretedOrdering.forSchema(valueSchema.map(_.dataType)) + val kvOrdering = new Ordering[(InternalRow, InternalRow)] { + override def compare(x: (InternalRow, InternalRow), y: (InternalRow, InternalRow)): Int = { + keyOrdering.compare(x._1, y._1) match { + case 0 => valueOrdering.compare(x._2, y._2) + case cmp => cmp } } + } - // Testing to make sure output from the sorter is sorted by key - var prevK: InternalRow = null - out.zipWithIndex.foreach { case ((k, v), i) => - if (prevK != null) { - assert(keyOrdering.compare(prevK, k) <= 0, - s""" - |key is not in sorted order: - |previous key: $prevK - |current key : $k - """.stripMargin) - } - prevK = k + // Testing to make sure output from the sorter is sorted by key + var prevK: InternalRow = null + out.zipWithIndex.foreach { case ((k, v), i) => + if (prevK != null) { + assert(keyOrdering.compare(prevK, k) <= 0, + s""" + |key is not in sorted order: + |previous key: $prevK + |current key : $k + """.stripMargin) } + prevK = k + } - // Testing to make sure the key/value in output matches input - assert(out.sorted(kvOrdering) === input.sorted(kvOrdering)) + // Testing to make sure the key/value in output matches input + assert(out.sorted(kvOrdering) === inputData.sorted(kvOrdering)) - // Make sure there is no memory leak - val leakedUnsafeMemory: Long = taskMemMgr.cleanUpAllAllocatedMemory - if (shuffleMemMgr != null) { - val leakedShuffleMemory: Long = shuffleMemMgr.getMemoryConsumptionForThisTask() - assert(0L === leakedShuffleMemory) - } - assert(0 === leakedUnsafeMemory) - TaskContext.unset() + // Make sure there is no memory leak + val leakedUnsafeMemory: Long = taskMemMgr.cleanUpAllAllocatedMemory + if (shuffleMemMgr != null) { + val leakedShuffleMemory: Long = shuffleMemMgr.getMemoryConsumptionForThisTask() + assert(0L === 
leakedShuffleMemory) } + assert(0 === leakedUnsafeMemory) + TaskContext.unset() + } + + test("kv sorting with records that exceed page size") { + val pageSize = 128 + + val schema = StructType(StructField("b", BinaryType) :: Nil) + val externalConverter = CatalystTypeConverters.createToCatalystConverter(schema) + val converter = UnsafeProjection.create(schema) + + val rand = new Random() + val inputData = Seq.fill(1024) { + val kBytes = new Array[Byte](rand.nextInt(pageSize)) + val vBytes = new Array[Byte](rand.nextInt(pageSize)) + rand.nextBytes(kBytes) + rand.nextBytes(vBytes) + val k = converter(externalConverter.apply(Row(kBytes)).asInstanceOf[InternalRow]) + val v = converter(externalConverter.apply(Row(vBytes)).asInstanceOf[InternalRow]) + (k.asInstanceOf[InternalRow].copy(), v.asInstanceOf[InternalRow].copy()) + } + + testKVSorter( + schema, + schema, + inputData, + pageSize, + spill = true + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index 40b47ae18d64..0113d052e338 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -17,20 +17,42 @@ package org.apache.spark.sql.execution -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.io.{File, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} -import org.apache.spark.SparkFunSuite +import org.apache.spark.executor.ShuffleWriteMetrics +import org.apache.spark.storage.ShuffleBlockId +import org.apache.spark.util.collection.ExternalSorter +import org.apache.spark.util.Utils import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ +import org.apache.spark._ + + +/** + * used to test close InputStream in UnsafeRowSerializer + */ +class ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStream(buf) { + var closed: Boolean = false + override def close(): Unit = { + closed = true + super.close() + } +} class UnsafeRowSerializerSuite extends SparkFunSuite { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { - val internalRow = CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow] + val converter = unsafeRowConverter(schema) + converter(row) + } + + private def unsafeRowConverter(schema: Array[DataType]): Row => UnsafeRow = { val converter = UnsafeProjection.create(schema) - converter.apply(internalRow) + (row: Row) => { + converter(CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]) + } } test("toUnsafeRow() test helper method") { @@ -52,8 +74,8 @@ class UnsafeRowSerializerSuite extends SparkFunSuite { serializerStream.writeValue(unsafeRow) } serializerStream.close() - val deserializerIter = serializer.deserializeStream( - new ByteArrayInputStream(baos.toByteArray)).asKeyValueIterator + val input = new ClosableByteArrayInputStream(baos.toByteArray) + val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator for (expectedRow <- unsafeRows) { val actualRow = deserializerIter.next().asInstanceOf[(Integer, UnsafeRow)]._2 assert(expectedRow.getSizeInBytes === actualRow.getSizeInBytes) @@ -61,5 +83,64 @@ class UnsafeRowSerializerSuite extends SparkFunSuite { 
assert(expectedRow.getInt(1) === actualRow.getInt(1)) } assert(!deserializerIter.hasNext) + assert(input.closed) + } + + test("close empty input stream") { + val baos = new ByteArrayOutputStream() + val dout = new DataOutputStream(baos) + dout.writeInt(-1) // EOF + dout.flush() + val input = new ClosableByteArrayInputStream(baos.toByteArray) + val serializer = new UnsafeRowSerializer(numFields = 2).newInstance() + val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator + assert(!deserializerIter.hasNext) + assert(input.closed) + } + + test("SPARK-10466: external sorter spilling with unsafe row serializer") { + var sc: SparkContext = null + var outputFile: File = null + val oldEnv = SparkEnv.get // save the old SparkEnv, as it will be overwritten + Utils.tryWithSafeFinally { + val conf = new SparkConf() + .set("spark.shuffle.spill.initialMemoryThreshold", "1024") + .set("spark.shuffle.sort.bypassMergeThreshold", "0") + .set("spark.shuffle.memoryFraction", "0.0001") + + sc = new SparkContext("local", "test", conf) + outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "") + // prepare data + val converter = unsafeRowConverter(Array(IntegerType)) + val data = (1 to 1000).iterator.map { i => + (i, converter(Row(i))) + } + val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( + partitioner = Some(new HashPartitioner(10)), + serializer = Some(new UnsafeRowSerializer(numFields = 1))) + + // Ensure we spilled something and have to merge them later + assert(sorter.numSpills === 0) + sorter.insertAll(data) + assert(sorter.numSpills > 0) + + // Merging spilled files should not throw assertion error + val taskContext = + new TaskContextImpl(0, 0, 0, 0, null, null, InternalAccumulator.create(sc)) + taskContext.taskMetrics.shuffleWriteMetrics = Some(new ShuffleWriteMetrics) + sorter.writePartitionedFile(ShuffleBlockId(0, 0, 0), taskContext, outputFile) + } { + // Clean up + if (sc != null) { + sc.stop() + } + + // restore the spark env + SparkEnv.set(oldEnv) + + if (outputFile != null) { + outputFile.delete() + } + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala new file mode 100644 index 000000000000..5fdb82b06772 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.unsafe.memory.TaskMemoryManager + +class TungstenAggregationIteratorSuite extends SparkFunSuite with SharedSQLContext { + + test("memory acquired on construction") { + val taskMemoryManager = new TaskMemoryManager(SparkEnv.get.executorMemoryManager) + val taskContext = new TaskContextImpl(0, 0, 0, 0, taskMemoryManager, null, Seq.empty) + TaskContext.setTaskContext(taskContext) + + // Assert that a page is allocated before processing starts + var iter: TungstenAggregationIterator = null + try { + val newMutableProjection = (expr: Seq[Expression], schema: Seq[Attribute]) => { + () => new InterpretedMutableProjection(expr, schema) + } + val dummyAccum = SQLMetrics.createLongMetric(ctx.sparkContext, "dummy") + iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, 0, + Seq.empty, newMutableProjection, Seq.empty, None, dummyAccum, dummyAccum) + val numPages = iter.getHashMap.getNumDataPages + assert(numPages === 1) + } finally { + // Clean up + if (iter != null) { + iter.free() + } + TaskContext.unset() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala similarity index 90% rename from sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index f19f22fca7d5..1174b27732f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -15,27 +15,25 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json -import java.io.StringWriter +import java.io.{File, StringWriter} import java.sql.{Date, Timestamp} import com.fasterxml.jackson.core.JsonFactory +import org.apache.spark.rdd.RDD import org.scalactic.Tolerance._ import org.apache.spark.sql.{QueryTest, Row, SQLConf} -import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.json.InferSchema.compatibleType +import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation} +import org.apache.spark.sql.execution.datasources.json.InferSchema.compatibleType +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JsonSuite extends QueryTest with TestJsonData { - - protected lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.sql - import ctx.implicits._ +class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { + import testImplicits._ test("Type promotion") { def checkTypePromotion(expected: Any, actual: Any) { @@ -73,8 +71,6 @@ class JsonSuite extends QueryTest with TestJsonData { val doubleNumber: Double = 1.7976931348623157E308d checkTypePromotion(doubleNumber.toDouble, enforceCorrectType(doubleNumber, DoubleType)) - checkTypePromotion( - Decimal(doubleNumber), enforceCorrectType(doubleNumber, DecimalType.SYSTEM_DEFAULT)) checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber)), enforceCorrectType(intNumber, TimestampType)) @@ -150,7 +146,7 @@ class JsonSuite extends QueryTest with TestJsonData { // DoubleType checkDataType(DoubleType, DoubleType, DoubleType) - checkDataType(DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT) + checkDataType(DoubleType, DecimalType.SYSTEM_DEFAULT, DoubleType) checkDataType(DoubleType, StringType, StringType) checkDataType(DoubleType, ArrayType(IntegerType), StringType) checkDataType(DoubleType, StructType(Nil), StringType) @@ -241,7 +237,7 @@ class JsonSuite extends QueryTest with TestJsonData { val jsonDF = ctx.read.json(primitiveFieldAndType) val expectedSchema = StructType( - StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) :: + StructField("bigInteger", DecimalType(20, 0), true) :: StructField("boolean", BooleanType, true) :: StructField("double", DoubleType, true) :: StructField("integer", LongType, true) :: @@ -271,7 +267,7 @@ class JsonSuite extends QueryTest with TestJsonData { val expectedSchema = StructType( StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, true), true), true) :: - StructField("arrayOfBigInteger", ArrayType(DecimalType.SYSTEM_DEFAULT, true), true) :: + StructField("arrayOfBigInteger", ArrayType(DecimalType(21, 0), true), true) :: StructField("arrayOfBoolean", ArrayType(BooleanType, true), true) :: StructField("arrayOfDouble", ArrayType(DoubleType, true), true) :: StructField("arrayOfInteger", ArrayType(LongType, true), true) :: @@ -285,7 +281,7 @@ class JsonSuite extends QueryTest with TestJsonData { StructField("field3", StringType, true) :: Nil), true), true) :: StructField("struct", StructType( StructField("field1", BooleanType, true) :: - StructField("field2", DecimalType.SYSTEM_DEFAULT, true) :: Nil), true) :: + StructField("field2", DecimalType(20, 0), true) :: Nil), true) :: 
StructField("structWithArrayFields", StructType( StructField("field1", ArrayType(LongType, true), true) :: StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil) @@ -386,7 +382,7 @@ class JsonSuite extends QueryTest with TestJsonData { val expectedSchema = StructType( StructField("num_bool", StringType, true) :: StructField("num_num_1", LongType, true) :: - StructField("num_num_2", DecimalType.SYSTEM_DEFAULT, true) :: + StructField("num_num_2", DoubleType, true) :: StructField("num_num_3", DoubleType, true) :: StructField("num_str", StringType, true) :: StructField("str_bool", StringType, true) :: Nil) @@ -398,11 +394,9 @@ class JsonSuite extends QueryTest with TestJsonData { checkAnswer( sql("select * from jsonTable"), Row("true", 11L, null, 1.1, "13.1", "str1") :: - Row("12", null, new java.math.BigDecimal("21474836470.9"), null, null, "true") :: - Row("false", 21474836470L, - new java.math.BigDecimal("92233720368547758070"), 100, "str1", "false") :: - Row(null, 21474836570L, - new java.math.BigDecimal("1.1"), 21474836470L, "92233720368547758070", null) :: Nil + Row("12", null, 21474836470.9, null, null, "true") :: + Row("false", 21474836470L, 92233720368547758070d, 100, "str1", "false") :: + Row(null, 21474836570L, 1.1, 21474836470L, "92233720368547758070", null) :: Nil ) // Number and Boolean conflict: resolve the type as number in this query. @@ -425,8 +419,8 @@ class JsonSuite extends QueryTest with TestJsonData { // Widening to DecimalType checkAnswer( sql("select num_num_2 + 1.3 from jsonTable where num_num_2 > 1.1"), - Row(BigDecimal("21474836472.2")) :: - Row(BigDecimal("92233720368547758071.3")) :: Nil + Row(21474836472.2) :: + Row(92233720368547758071.3) :: Nil ) // Widening to Double @@ -578,7 +572,7 @@ class JsonSuite extends QueryTest with TestJsonData { test("jsonFile should be based on JSONRelation") { val dir = Utils.createTempDir() dir.delete() - val path = dir.getCanonicalPath + val path = dir.getCanonicalFile.toURI.toString ctx.sparkContext.parallelize(1 to 100) .map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path) val jsonDF = ctx.read.option("samplingRatio", "0.49").json(path) @@ -591,14 +585,15 @@ class JsonSuite extends QueryTest with TestJsonData { assert( relation.isInstanceOf[JSONRelation], "The DataFrame returned by jsonFile should be based on JSONRelation.") - assert(relation.asInstanceOf[JSONRelation].path === Some(path)) + assert(relation.asInstanceOf[JSONRelation].paths === Array(path)) assert(relation.asInstanceOf[JSONRelation].samplingRatio === (0.49 +- 0.001)) val schema = StructType(StructField("a", LongType, true) :: Nil) val logicalRelation = - ctx.read.schema(schema).json(path).queryExecution.analyzed.asInstanceOf[LogicalRelation] + ctx.read.schema(schema).json(path) + .queryExecution.analyzed.asInstanceOf[LogicalRelation] val relationWithSchema = logicalRelation.relation.asInstanceOf[JSONRelation] - assert(relationWithSchema.path === Some(path)) + assert(relationWithSchema.paths === Array(path)) assert(relationWithSchema.schema === schema) assert(relationWithSchema.samplingRatio > 0.99) } @@ -611,7 +606,7 @@ class JsonSuite extends QueryTest with TestJsonData { val jsonDF = ctx.read.json(path) val expectedSchema = StructType( - StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) :: + StructField("bigInteger", DecimalType(20, 0), true) :: StructField("boolean", BooleanType, true) :: StructField("double", DoubleType, true) :: StructField("integer", LongType, true) :: @@ -1040,26 +1035,35 @@ class JsonSuite 
extends QueryTest with TestJsonData { } test("JSONRelation equality test") { - val context = org.apache.spark.sql.test.TestSQLContext + val relation0 = new JSONRelation( + Some(empty), + 1.0, + Some(StructType(StructField("a", IntegerType, true) :: Nil)), + None, None)(ctx) + val logicalRelation0 = LogicalRelation(relation0) val relation1 = new JSONRelation( - "path", + Some(singleRow), 1.0, Some(StructType(StructField("a", IntegerType, true) :: Nil)), - context) + None, None)(ctx) val logicalRelation1 = LogicalRelation(relation1) val relation2 = new JSONRelation( - "path", + Some(singleRow), 0.5, Some(StructType(StructField("a", IntegerType, true) :: Nil)), - context) + None, None)(ctx) val logicalRelation2 = LogicalRelation(relation2) val relation3 = new JSONRelation( - "path", + Some(singleRow), 1.0, - Some(StructType(StructField("b", StringType, true) :: Nil)), - context) + Some(StructType(StructField("b", IntegerType, true) :: Nil)), + None, None)(ctx) val logicalRelation3 = LogicalRelation(relation3) + assert(relation0 !== relation1) + assert(!logicalRelation0.sameResult(logicalRelation1), + s"$logicalRelation0 and $logicalRelation1 should be considered not having the same result.") + assert(relation1 === relation2) assert(logicalRelation1.sameResult(logicalRelation2), s"$logicalRelation1 and $logicalRelation2 should be considered having the same result.") @@ -1071,6 +1075,27 @@ class JsonSuite extends QueryTest with TestJsonData { assert(relation2 !== relation3) assert(!logicalRelation2.sameResult(logicalRelation3), s"$logicalRelation2 and $logicalRelation3 should be considered not having the same result.") + + withTempPath(dir => { + val path = dir.getCanonicalFile.toURI.toString + ctx.sparkContext.parallelize(1 to 100) + .map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path) + + val d1 = ResolvedDataSource( + ctx, + userSpecifiedSchema = None, + partitionColumns = Array.empty[String], + provider = classOf[DefaultSource].getCanonicalName, + options = Map("path" -> path)) + + val d2 = ResolvedDataSource( + ctx, + userSpecifiedSchema = None, + partitionColumns = Array.empty[String], + provider = classOf[DefaultSource].getCanonicalName, + options = Map("path" -> path)) + assert(d1 === d2) + }) } test("SPARK-6245 JsonRDD.inferSchema on empty RDD") { @@ -1105,4 +1130,37 @@ class JsonSuite extends QueryTest with TestJsonData { val emptySchema = InferSchema(emptyRecords, 1.0, "") assert(StructType(Seq()) === emptySchema) } + + test("JSON with Partition") { + def makePartition(rdd: RDD[String], parent: File, partName: String, partValue: Any): File = { + val p = new File(parent, s"$partName=${partValue.toString}") + rdd.saveAsTextFile(p.getCanonicalPath) + p + } + + withTempPath(root => { + val d1 = new File(root, "d1=1") + // root/dt=1/col1=abc + val p1_col1 = makePartition( + ctx.sparkContext.parallelize(2 to 5).map(i => s"""{"a": 1, "b": "str$i"}"""), + d1, + "col1", + "abc") + + // root/dt=1/col1=abd + val p2 = makePartition( + ctx.sparkContext.parallelize(6 to 10).map(i => s"""{"a": 1, "b": "str$i"}"""), + d1, + "col1", + "abd") + + ctx.read.json(root.getAbsolutePath).registerTempTable("test_myjson_with_part") + checkAnswer(sql( + "SELECT count(a) FROM test_myjson_with_part where d1 = 1 and col1='abc'"), Row(4)) + checkAnswer(sql( + "SELECT count(a) FROM test_myjson_with_part where d1 = 1 and col1='abd'"), Row(5)) + checkAnswer(sql( + "SELECT count(a) FROM test_myjson_with_part where d1 = 1"), Row(9)) + }) + } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala similarity index 88% rename from sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index eb62066ac643..2864181cf91d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala @@ -15,17 +15,16 @@ * limitations under the License. */ -package org.apache.spark.sql.json +package org.apache.spark.sql.execution.datasources.json import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext -trait TestJsonData { - - protected def ctx: SQLContext +private[json] trait TestJsonData { + protected def _sqlContext: SQLContext def primitiveFieldAndType: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"string":"this is a simple string.", "integer":10, "long":21474836470, @@ -36,7 +35,7 @@ trait TestJsonData { }""" :: Nil) def primitiveFieldValueTypeConflict: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"num_num_1":11, "num_num_2":null, "num_num_3": 1.1, "num_bool":true, "num_str":13.1, "str_bool":"str1"}""" :: """{"num_num_1":null, "num_num_2":21474836470.9, "num_num_3": null, @@ -47,14 +46,14 @@ trait TestJsonData { "num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil) def jsonNullStruct: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" :: """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" :: """{"nullstr":"","ip":"27.31.100.29","headers":""}""" :: """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil) def complexFieldValueTypeConflict: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"num_struct":11, "str_array":[1, 2, 3], "array":[], "struct_array":[], "struct": {}}""" :: """{"num_struct":{"field":false}, "str_array":null, @@ -65,14 +64,14 @@ trait TestJsonData { "array":[7], "struct_array":{"field": true}, "struct": {"field": "str"}}""" :: Nil) def arrayElementTypeConflict: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}], "array2": [{"field":214748364700}, {"field":1}]}""" :: """{"array3": [{"field":"str"}, {"field":1}]}""" :: """{"array3": [1, 2, 3]}""" :: Nil) def missingFields: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"a":true}""" :: """{"b":21474836470}""" :: """{"c":[33, 44]}""" :: @@ -80,7 +79,7 @@ trait TestJsonData { """{"e":"str"}""" :: Nil) def complexFieldAndType1: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"struct":{"field1": true, "field2": 92233720368547758070}, "structWithArrayFields":{"field1":[4, 5, 6], "field2":["str1", "str2"]}, "arrayOfString":["str1", "str2"], @@ -96,7 +95,7 @@ trait TestJsonData { }""" :: Nil) def complexFieldAndType2: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"arrayOfStruct":[{"field1": true, "field2": "str1"}, {"field1": false}, {"field3": null}], "complexArrayOfStruct": [ { @@ -150,7 +149,7 @@ trait TestJsonData { }""" :: 
Nil) def mapType1: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"map": {"a": 1}}""" :: """{"map": {"b": 2}}""" :: """{"map": {"c": 3}}""" :: @@ -158,7 +157,7 @@ trait TestJsonData { """{"map": {"e": null}}""" :: Nil) def mapType2: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"map": {"a": {"field1": [1, 2, 3, null]}}}""" :: """{"map": {"b": {"field2": 2}}}""" :: """{"map": {"c": {"field1": [], "field2": 4}}}""" :: @@ -167,21 +166,21 @@ trait TestJsonData { """{"map": {"f": {"field1": null}}}""" :: Nil) def nullsInArrays: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{"field1":[[null], [[["Test"]]]]}""" :: """{"field2":[null, [{"Test":1}]]}""" :: """{"field3":[[null], [{"Test":"2"}]]}""" :: """{"field4":[[null, [1,2,3]]]}""" :: Nil) def jsonArray: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """[{"a":"str_a_1"}]""" :: """[{"a":"str_a_2"}, {"b":"str_b_3"}]""" :: """{"b":"str_b_4", "a":"str_a_4", "c":"str_c_4"}""" :: """[]""" :: Nil) def corruptRecords: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{""" :: """""" :: """{"a":1, b:2}""" :: @@ -190,7 +189,7 @@ trait TestJsonData { """]""" :: Nil) def emptyRecords: RDD[String] = - ctx.sparkContext.parallelize( + _sqlContext.sparkContext.parallelize( """{""" :: """""" :: """{"a": {}}""" :: @@ -198,5 +197,8 @@ trait TestJsonData { """{"b": [{"c": {}}]}""" :: """]""" :: Nil) - def empty: RDD[String] = ctx.sparkContext.parallelize(Seq[String]()) + + lazy val singleRow: RDD[String] = _sqlContext.sparkContext.parallelize("""{"a":123}""" :: Nil) + + def empty: RDD[String] = _sqlContext.sparkContext.parallelize(Seq[String]()) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetWriter.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetWriter.scala new file mode 100644 index 000000000000..d05c6098dca0 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetWriter.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.io.api.RecordConsumer +import org.apache.parquet.schema.{MessageType, MessageTypeParser} + +private[sql] object DirectParquetWriter { + type RecordBuilder = RecordConsumer => Unit + + /** + * A testing Parquet [[WriteSupport]] implementation used to write manually constructed Parquet + * records with arbitrary structures. + */ + private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String]) + extends WriteSupport[RecordBuilder] { + + private var recordConsumer: RecordConsumer = _ + + override def init(configuration: conf.Configuration): WriteContext = { + new WriteContext(schema, metadata.asJava) + } + + override def write(buildRecord: RecordBuilder): Unit = { + recordConsumer.startMessage() + buildRecord(recordConsumer) + recordConsumer.endMessage() + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + } + + def writeDirect + (path: String, schema: String, metadata: Map[String, String] = Map.empty) + (f: ParquetWriter[RecordBuilder] => Unit): Unit = { + val messageType = MessageTypeParser.parseMessageType(schema) + val writeSupport = new DirectWriteSupport(messageType, metadata) + val parquetWriter = new ParquetWriter[RecordBuilder](new Path(path), writeSupport) + try f(parquetWriter) finally parquetWriter.close() + } + + def message(writer: ParquetWriter[RecordBuilder])(builder: RecordBuilder): Unit = { + writer.write(builder) + } + + def group(consumer: RecordConsumer)(f: => Unit): Unit = { + consumer.startGroup() + f + consumer.endGroup() + } + + def field(consumer: RecordConsumer, name: String, index: Int = 0)(f: => Unit): Unit = { + consumer.startField(name, index) + f + consumer.endField(name, index) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala new file mode 100644 index 000000000000..45db619567a2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.nio.ByteBuffer +import java.util.{List => JList, Map => JMap} + +import scala.collection.JavaConverters.seqAsJavaListConverter +import scala.collection.JavaConverters.mapAsJavaMapConverter + +import org.apache.avro.Schema +import org.apache.avro.generic.IndexedRecord +import org.apache.hadoop.fs.Path +import org.apache.parquet.avro.AvroParquetWriter + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.datasources.parquet.test.avro._ +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest with SharedSQLContext { + private def withWriter[T <: IndexedRecord] + (path: String, schema: Schema) + (f: AvroParquetWriter[T] => Unit): Unit = { + logInfo( + s"""Writing Avro records with the following Avro schema into Parquet file: + | + |${schema.toString(true)} + """.stripMargin) + + val writer = new AvroParquetWriter[T](new Path(path), schema) + try f(writer) finally writer.close() + } + + test("required primitives") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroPrimitives](path, AvroPrimitives.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write( + AvroPrimitives.newBuilder() + .setBoolColumn(i % 2 == 0) + .setIntColumn(i) + .setLongColumn(i.toLong * 10) + .setFloatColumn(i.toFloat + 0.1f) + .setDoubleColumn(i.toDouble + 0.2d) + .setBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes("UTF-8"))) + .setStringColumn(s"val_$i") + .build()) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row( + i % 2 == 0, + i, + i.toLong * 10, + i.toFloat + 0.1f, + i.toDouble + 0.2d, + s"val_$i".getBytes("UTF-8"), + s"val_$i") + }) + } + } + + test("optional primitives") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroOptionalPrimitives](path, AvroOptionalPrimitives.getClassSchema) { writer => + (0 until 10).foreach { i => + val record = if (i % 3 == 0) { + AvroOptionalPrimitives.newBuilder() + .setMaybeBoolColumn(null) + .setMaybeIntColumn(null) + .setMaybeLongColumn(null) + .setMaybeFloatColumn(null) + .setMaybeDoubleColumn(null) + .setMaybeBinaryColumn(null) + .setMaybeStringColumn(null) + .build() + } else { + AvroOptionalPrimitives.newBuilder() + .setMaybeBoolColumn(i % 2 == 0) + .setMaybeIntColumn(i) + .setMaybeLongColumn(i.toLong * 10) + .setMaybeFloatColumn(i.toFloat + 0.1f) + .setMaybeDoubleColumn(i.toDouble + 0.2d) + .setMaybeBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes("UTF-8"))) + .setMaybeStringColumn(s"val_$i") + .build() + } + + writer.write(record) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + if (i % 3 == 0) { + Row.apply(Seq.fill(7)(null): _*) + } else { + Row( + i % 2 == 0, + i, + i.toLong * 10, + i.toFloat + 0.1f, + i.toDouble + 0.2d, + s"val_$i".getBytes("UTF-8"), + s"val_$i") + } + }) + } + } + + test("non-nullable arrays") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroNonNullableArrays](path, AvroNonNullableArrays.getClassSchema) { writer => + (0 until 10).foreach { i => + val record = { + val builder = + AvroNonNullableArrays.newBuilder() + .setStringsColumn(Seq.tabulate(3)(i => s"val_$i").asJava) + + if (i % 3 == 0) { + builder.setMaybeIntsColumn(null).build() + } else { + builder.setMaybeIntsColumn(Seq.tabulate(3)(Int.box).asJava).build() + } + } + + writer.write(record) + } + } + + 
logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row( + Seq.tabulate(3)(i => s"val_$i"), + if (i % 3 == 0) null else Seq.tabulate(3)(identity)) + }) + } + } + + ignore("nullable arrays (parquet-avro 1.7.0 does not properly support this)") { + // TODO Complete this test case after upgrading to parquet-mr 1.8+ + } + + test("SPARK-10136 array of primitive array") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroArrayOfArray](path, AvroArrayOfArray.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write(AvroArrayOfArray.newBuilder() + .setIntArraysColumn( + Seq.tabulate(3, 3)((i, j) => i * 3 + j: Integer).map(_.asJava).asJava) + .build()) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row(Seq.tabulate(3, 3)((i, j) => i * 3 + j)) + }) + } + } + + test("map of primitive array") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroMapOfArray](path, AvroMapOfArray.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write(AvroMapOfArray.newBuilder() + .setStringToIntsColumn( + Seq.tabulate(3) { i => + i.toString -> Seq.tabulate(3)(j => i + j: Integer).asJava + }.toMap.asJava) + .build()) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row(Seq.tabulate(3)(i => i.toString -> Seq.tabulate(3)(j => i + j)).toMap) + }) + } + } + + test("various complex types") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[ParquetAvroCompat](path, ParquetAvroCompat.getClassSchema) { writer => + (0 until 10).foreach(i => writer.write(makeParquetAvroCompat(i))) + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row( + Seq.tabulate(3)(n => s"arr_${i + n}"), + Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap, + Seq.tabulate(3) { n => + (i + n).toString -> Seq.tabulate(3) { m => + Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}") + } + }.toMap) + }) + } + } + + def makeParquetAvroCompat(i: Int): ParquetAvroCompat = { + def makeComplexColumn(i: Int): JMap[String, JList[Nested]] = { + Seq.tabulate(3) { n => + (i + n).toString -> Seq.tabulate(3) { m => + Nested + .newBuilder() + .setNestedIntsColumn(Seq.tabulate(3)(j => i + j + m: Integer).asJava) + .setNestedStringColumn(s"val_${i + m}") + .build() + }.asJava + }.toMap.asJava + } + + ParquetAvroCompat + .newBuilder() + .setStringsColumn(Seq.tabulate(3)(n => s"arr_${i + n}").asJava) + .setStringToIntColumn(Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap.asJava) + .setComplexColumn(makeComplexColumn(i)) + .build() + } + + test("SPARK-9407 Push down predicates involving Parquet ENUM columns") { + import testImplicits._ + + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[ParquetEnum](path, ParquetEnum.getClassSchema) { writer => + (0 until 4).foreach { i => + writer.write(ParquetEnum.newBuilder().setSuit(Suit.values.apply(i)).build()) + } + } + + checkAnswer(sqlContext.read.parquet(path).filter('suit === "SPADES"), Row("SPADES")) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala similarity index 62% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala rename to 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala index b4cdfd9e98f6..d85c564e3e8d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala @@ -15,38 +15,41 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet -import java.io.File +package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConversions._ -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.MessageType -import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.QueryTest -import org.apache.spark.util.Utils -abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest with BeforeAndAfterAll { - protected var parquetStore: File = _ - - override protected def beforeAll(): Unit = { - parquetStore = Utils.createTempDir(namePrefix = "parquet-compat_") - parquetStore.delete() - } - - override protected def afterAll(): Unit = { - Utils.deleteRecursively(parquetStore) +/** + * Helper class for testing Parquet compatibility. + */ +private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest { + protected def readParquetSchema(path: String): MessageType = { + readParquetSchema(path, { path => !path.getName.startsWith("_") }) } - def readParquetSchema(path: String): MessageType = { + protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = { val fsPath = new Path(path) val fs = fsPath.getFileSystem(configuration) - val parquetFiles = fs.listStatus(fsPath).toSeq.filterNot(_.getPath.getName.startsWith("_")) + val parquetFiles = fs.listStatus(fsPath, new PathFilter { + override def accept(path: Path): Boolean = pathFilter(path) + }).toSeq + val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true) footers.head.getParquetMetadata.getFileMetaData.getSchema } + + protected def logParquetSchema(path: String): Unit = { + logInfo( + s"""Schema of the Parquet file written by parquet-avro: + |${readParquetSchema(path)} + """.stripMargin) + } } object ParquetCompatibilityTest { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala similarity index 83% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index b6a7c4fbddbd..f067112cfca9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -15,17 +15,17 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import org.apache.parquet.filter2.predicate.Operators._ import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators} +import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf} +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, LogicalRelation} +import org.apache.spark.sql.test.SharedSQLContext /** * A test suite that tests Parquet filter2 API based filter pushdown optimization. @@ -39,8 +39,7 @@ import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf} * 2. `Tuple1(Option(x))` is used together with `AnyVal` types like `Int` to ensure the inferred * data type is nullable. */ -class ParquetFilterSuite extends QueryTest with ParquetTest { - lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext +class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext { private def checkFilterPredicate( df: DataFrame, @@ -55,20 +54,22 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { .select(output.map(e => Column(e)): _*) .where(Column(predicate)) - val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + val analyzedPredicate = query.queryExecution.optimizedPlan.collect { case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation)) => filters - }.flatten.reduceOption(_ && _) + }.flatten + assert(analyzedPredicate.nonEmpty) - assert(maybeAnalyzedPredicate.isDefined) - maybeAnalyzedPredicate.foreach { pred => - val maybeFilter = ParquetFilters.createFilter(pred) + val selectedFilters = DataSourceStrategy.selectFilters(analyzedPredicate) + assert(selectedFilters.nonEmpty) + + selectedFilters.foreach { pred => + val maybeFilter = ParquetFilters.createFilter(df.schema, pred) assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") maybeFilter.foreach { f => // Doesn't bother checking type parameters here (e.g. 
`Eq[Integer]`) assert(f.getClass === filterClass) } } - checker(query, expected) } } @@ -109,43 +110,18 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) checkFilterPredicate('_1 === true, classOf[Eq[_]], true) + checkFilterPredicate('_1 <=> true, classOf[Eq[_]], true) checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) } } - test("filter pushdown - short") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit df => - checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) - checkFilterPredicate( - Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) - checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) - - checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) - checkFilterPredicate( - Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) - checkFilterPredicate( - Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, - classOf[Operators.Or], - Seq(Row(1), Row(4))) - } - } - test("filter pushdown - integer") { withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -154,13 +130,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -171,6 +147,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -179,13 +156,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + 
checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -196,6 +173,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -204,13 +182,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -221,6 +199,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -229,13 +208,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -247,6 +226,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") + checkFilterPredicate('_1 <=> "1", classOf[Eq[_]], "1") checkFilterPredicate( '_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) @@ -256,13 +236,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") + checkFilterPredicate(Literal("1") <=> '_1, classOf[Eq[_]], "1") checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") 
checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") - checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) } } @@ -274,6 +254,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { withParquetDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate('_1 <=> 1.b, classOf[Eq[_]], 1.b) checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkBinaryFilterPredicate( @@ -288,20 +269,20 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate(Literal(1.b) <=> '_1, classOf[Eq[_]], 1.b) checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) checkBinaryFilterPredicate( '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) } } test("SPARK-6554: don't push down predicates which reference partition columns") { - import sqlContext.implicits._ + import testImplicits._ withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { withTempPath { dir => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala similarity index 83% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index b415da5b8c13..e6b0a2ea95e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConversions._ import scala.reflect.ClassTag @@ -37,6 +37,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ // Write support class for nested groups: ParquetWriter initializes GroupWriteSupport @@ -62,9 +63,8 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS /** * A test suite that tests basic Parquet I/O. */ -class ParquetIOSuite extends QueryTest with ParquetTest { - lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.implicits._ +class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { + import testImplicits._ /** * Writes `data` to a Parquet file, reads it back and check file contents. 
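The helper described in this comment is the usual write/read/compare round trip against a temporary path. A rough standalone equivalent using only the public DataFrame API is sketched below; the ParquetRoundTripSketch object, the column names, and the temp-directory handling are illustrative and not taken from the suite itself.

import java.nio.file.Files

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Illustrative Parquet round trip: write a DataFrame, read it back from the same
// path, and compare the collected rows with the original data.
object ParquetRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("parquet-roundtrip"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val dir = Files.createTempDirectory("parquet-roundtrip").toFile
    dir.delete() // the writer expects the target path not to exist yet
    val path = dir.getCanonicalPath

    val data = (1 to 4).map(i => (i, s"val_$i"))
    sc.parallelize(data).toDF("a", "b").write.parquet(path)

    val readBack = sqlContext.read.parquet(path)
      .collect()
      .map(r => (r.getInt(0), r.getString(1)))
    assert(readBack.sorted.toSeq == data.sorted, "Parquet round trip changed the data")

    sc.stop()
  }
}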
@@ -369,6 +369,30 @@ class ParquetIOSuite extends QueryTest with ParquetTest { test("SPARK-6352 DirectParquetOutputCommitter") { val clonedConf = new Configuration(configuration) + // Write to a parquet file and let it fail. + // _temporary should be missing if direct output committer works. + try { + configuration.set("spark.sql.parquet.output.committer.class", + classOf[DirectParquetOutputCommitter].getCanonicalName) + sqlContext.udf.register("div0", (x: Int) => x / 0) + withTempPath { dir => + intercept[org.apache.spark.SparkException] { + sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) + } + val path = new Path(dir.getCanonicalPath, "_temporary") + val fs = path.getFileSystem(configuration) + assert(!fs.exists(path)) + } + } finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + } + } + + test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { + val clonedConf = new Configuration(configuration) + // Write to a parquet file and let it fail. // _temporary should be missing if direct output committer works. try { @@ -390,7 +414,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } } - test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") { + + test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => val clonedConf = new Configuration(configuration) @@ -399,7 +424,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { configuration.set( "spark.sql.parquet.output.committer.class", - classOf[BogusParquetOutputCommitter].getCanonicalName) + classOf[JobCommitFailureParquetOutputCommitter].getCanonicalName) try { val message = intercept[SparkException] { @@ -425,12 +450,54 @@ class ParquetIOSuite extends QueryTest with ParquetTest { }.toString assert(errorMessage.contains("UnknownHostException")) } + + test("SPARK-7837 Do not close output writer twice when commitTask() fails") { + val clonedConf = new Configuration(configuration) + + // Using a output committer that always fail when committing a task, so that both + // `commitTask()` and `abortTask()` are invoked. + configuration.set( + "spark.sql.parquet.output.committer.class", + classOf[TaskCommitFailureParquetOutputCommitter].getCanonicalName) + + try { + // Before fixing SPARK-7837, the following code results in an NPE because both + // `commitTask()` and `abortTask()` try to close output writers. 
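The hazard this comment describes is a double close: when commitTask() fails, the abort path runs as well, and both code paths try to close the same output writer. A common defensive pattern is an idempotent close guard; the sketch below illustrates that general idea only and is not the actual fix applied for SPARK-7837.

// Generic double-close guard: the second and any later close() become no-ops
// instead of touching an already-released underlying writer.
class GuardedWriter(underlying: java.io.Writer) {
  private var closed = false

  def write(s: String): Unit = {
    require(!closed, "writer already closed")
    underlying.write(s)
  }

  def close(): Unit = {
    if (!closed) {
      closed = true
      underlying.close()
    }
  }
}

object GuardedWriterDemo {
  def main(args: Array[String]): Unit = {
    val w = new GuardedWriter(new java.io.StringWriter())
    w.write("hello")
    w.close()
    w.close() // e.g. a commit path and an abort path both cleaning up: now harmless
  }
}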
+ + withTempPath { dir => + val m1 = intercept[SparkException] { + sqlContext.range(1).coalesce(1).write.parquet(dir.getCanonicalPath) + }.getCause.getMessage + assert(m1.contains("Intentional exception for testing purposes")) + } + + withTempPath { dir => + val m2 = intercept[SparkException] { + val df = sqlContext.range(1).select('id as 'a, 'id as 'b).coalesce(1) + df.write.partitionBy("a").parquet(dir.getCanonicalPath) + }.getCause.getMessage + assert(m2.contains("Intentional exception for testing purposes")) + } + } finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + } + } } -class BogusParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) +class JobCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { override def commitJob(jobContext: JobContext): Unit = { sys.error("Intentional exception for testing purposes") } } + +class TaskCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) + extends ParquetOutputCommitter(outputPath, context) { + + override def commitTask(context: TaskAttemptContext): Unit = { + sys.error("Intentional exception for testing purposes") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala new file mode 100644 index 000000000000..d17d9304efb2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSQLContext { + test("parquet files with different physical schemas but share the same logical schema") { + // This test case writes two Parquet files, both representing the following Catalyst schema + // + // StructType( + // StructField( + // "f", + // ArrayType(IntegerType, containsNull = false), + // nullable = false)) + // + // The first Parquet file comes with parquet-avro style 2-level LIST-annotated repeated group, + // while the other one comes with parquet-protobuf style 1-level unannotated repeated primitive + // field. 
+ withTempDir { dir => + import DirectParquetWriter._ + + val avroStylePath = new File(dir, "avro-style").getCanonicalPath + val protobufStylePath = new File(dir, "protobuf-style").getCanonicalPath + + val avroStyleSchema = + """message avro_style { + | required group f (LIST) { + | repeated int32 array; + | } + |} + """.stripMargin + + writeDirect(avroStylePath, avroStyleSchema) { writer => + message(writer) { rc => + field(rc, "f") { + group(rc) { + field(rc, "array") { + rc.addInteger(0) + rc.addInteger(1) + } + } + } + } + + message(writer) { rc => + field(rc, "f") { + group(rc) { + field(rc, "array") { + rc.addInteger(2) + rc.addInteger(3) + } + } + } + } + } + + logParquetSchema(avroStylePath) + + val protobufStyleSchema = + """message protobuf_style { + | repeated int32 f; + |} + """.stripMargin + + writeDirect(protobufStylePath, protobufStyleSchema) { writer => + message(writer) { rc => + field(rc, "f") { + rc.addInteger(4) + rc.addInteger(5) + } + } + + message(writer) { rc => + field(rc, "f") { + rc.addInteger(6) + rc.addInteger(7) + } + } + } + + logParquetSchema(protobufStylePath) + + checkAnswer( + sqlContext.read.parquet(dir.getCanonicalPath), + Seq( + Row(Seq(0, 1)), + Row(Seq(2, 3)), + Row(Seq(4, 5)), + Row(Seq(6, 7)))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala similarity index 98% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 2eef10189f11..ed8bafb10c60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.io.File import java.math.BigInteger @@ -26,13 +26,13 @@ import scala.collection.mutable.ArrayBuffer import com.google.common.io.Files import org.apache.hadoop.fs.Path +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.execution.datasources.{LogicalRelation, PartitionSpec, Partition, PartitioningUtils} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql._ import org.apache.spark.unsafe.types.UTF8String -import PartitioningUtils._ // The data where the partitioning key exists only in the directory structure. 
case class ParquetData(intField: Int, stringField: String) @@ -40,11 +40,9 @@ case class ParquetData(intField: Int, stringField: String) // The data that also includes the partitioning key case class ParquetDataWithKey(intField: Int, pi: Int, stringField: String, ps: String) -class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest { - - override lazy val sqlContext: SQLContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.implicits._ - import sqlContext.sql +class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with SharedSQLContext { + import PartitioningUtils._ + import testImplicits._ val defaultPartitionName = "__HIVE_DEFAULT_PARTITION__" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala new file mode 100644 index 000000000000..b290429c2a02 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetProtobufCompatibilitySuite extends ParquetCompatibilityTest with SharedSQLContext { + + private def readParquetProtobufFile(name: String): DataFrame = { + val url = Thread.currentThread().getContextClassLoader.getResource(name) + sqlContext.read.parquet(url.toString) + } + + test("unannotated array of primitive type") { + checkAnswer(readParquetProtobufFile("old-repeated-int.parquet"), Row(Seq(1, 2, 3))) + } + + test("unannotated array of struct") { + checkAnswer( + readParquetProtobufFile("old-repeated-message.parquet"), + Row( + Seq( + Row("First inner", null, null), + Row(null, "Second inner", null), + Row(null, null, "Third inner")))) + + checkAnswer( + readParquetProtobufFile("proto-repeated-struct.parquet"), + Row( + Seq( + Row("0 - 1", "0 - 2", "0 - 3"), + Row("1 - 1", "1 - 2", "1 - 3")))) + + checkAnswer( + readParquetProtobufFile("proto-struct-with-array-many.parquet"), + Seq( + Row( + Seq( + Row("0 - 0 - 1", "0 - 0 - 2", "0 - 0 - 3"), + Row("0 - 1 - 1", "0 - 1 - 2", "0 - 1 - 3"))), + Row( + Seq( + Row("1 - 0 - 1", "1 - 0 - 2", "1 - 0 - 3"), + Row("1 - 1 - 1", "1 - 1 - 2", "1 - 1 - 3"))), + Row( + Seq( + Row("2 - 0 - 1", "2 - 0 - 2", "2 - 0 - 3"), + Row("2 - 1 - 1", "2 - 1 - 2", "2 - 1 - 3"))))) + } + + test("struct with unannotated array") { + checkAnswer( + readParquetProtobufFile("proto-struct-with-array.parquet"), + Row(10, 9, Seq.empty, null, Row(9), Seq(Row(9), Row(10)))) + } + + test("unannotated array of struct with unannotated array") { + checkAnswer( + readParquetProtobufFile("nested-array-struct.parquet"), + Seq( + Row(2, Seq(Row(1, Seq(Row(3))))), + Row(5, Seq(Row(4, Seq(Row(6))))), + Row(8, Seq(Row(7, Seq(Row(9))))))) + } + + test("unannotated array of string") { + checkAnswer( + readParquetProtobufFile("proto-repeated-string.parquet"), + Seq( + Row(Seq("hello", "world")), + Row(Seq("good", "bye")), + Row(Seq("one", "two", "three")))) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala new file mode 100644 index 000000000000..2cfa42d8cad9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -0,0 +1,554 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import org.apache.hadoop.fs.Path + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStructUDT, NestedStruct} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +/** + * A test suite that tests various Parquet queries. + */ +class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext { + + test("simple select queries") { + withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { + checkAnswer(sql("SELECT _1 FROM t where t._1 > 5"), (6 until 10).map(Row.apply(_))) + checkAnswer(sql("SELECT _1 FROM t as tmp where tmp._1 < 5"), (0 until 5).map(Row.apply(_))) + } + } + + test("appending") { + val data = (0 until 10).map(i => (i, i.toString)) + ctx.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") + withParquetTable(data, "t") { + sql("INSERT INTO TABLE t SELECT * FROM tmp") + checkAnswer(ctx.table("t"), (data ++ data).map(Row.fromTuple)) + } + ctx.catalog.unregisterTable(Seq("tmp")) + } + + test("overwriting") { + val data = (0 until 10).map(i => (i, i.toString)) + ctx.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") + withParquetTable(data, "t") { + sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") + checkAnswer(ctx.table("t"), data.map(Row.fromTuple)) + } + ctx.catalog.unregisterTable(Seq("tmp")) + } + + test("self-join") { + // 4 rows, cells of column 1 of row 2 and row 4 are null + val data = (1 to 4).map { i => + val maybeInt = if (i % 2 == 0) None else Some(i) + (maybeInt, i.toString) + } + + withParquetTable(data, "t") { + val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") + val queryOutput = selfJoin.queryExecution.analyzed.output + + assertResult(4, "Field count mismatches")(queryOutput.size) + assertResult(2, "Duplicated expression ID in query plan:\n $selfJoin") { + queryOutput.filter(_.name == "_1").map(_.exprId).size + } + + checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) + } + } + + test("nested data - struct with array field") { + val data = (1 to 10).map(i => Tuple1((i, Seq("val_$i")))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { + case Tuple1((_, Seq(string))) => Row(string) + }) + } + } + + test("nested data - array of struct") { + val data = (1 to 10).map(i => Tuple1(Seq(i -> "val_$i"))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { + case Tuple1(Seq((_, string))) => Row(string) + }) + } + } + + test("SPARK-1913 regression: columns only referenced by pushed down filters should remain") { + withParquetTable((1 to 10).map(Tuple1.apply), "t") { + checkAnswer(sql("SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) + } + } + + test("SPARK-5309 strings stored using dictionary compression in parquet") { + withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { + + checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), + (0 until 10).map(i => Row("same", "run_" + i, 100))) + + checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), + List(Row("same", "run_5", 100))) + } + } + + test("SPARK-6917 DecimalType should work with non-native types") { + val data = (1 to 10).map(i => Row(Decimal(i, 18, 
0), new java.sql.Timestamp(i))) + val schema = StructType(List(StructField("d", DecimalType(18, 0), false), + StructField("time", TimestampType, false)).toArray) + withTempPath { file => + val df = ctx.createDataFrame(ctx.sparkContext.parallelize(data), schema) + df.write.parquet(file.getCanonicalPath) + val df2 = ctx.read.parquet(file.getCanonicalPath) + checkAnswer(df2, df.collect().toSeq) + } + } + + test("Enabling/disabling merging partfiles when merging parquet schema") { + def testSchemaMerging(expectedColumnNumber: Int): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) + // delete summary files, so if we don't merge part-files, one column will not be included. + Utils.deleteRecursively(new File(basePath + "/foo=1/_metadata")) + Utils.deleteRecursively(new File(basePath + "/foo=1/_common_metadata")) + assert(ctx.read.parquet(basePath).columns.length === expectedColumnNumber) + } + } + + withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", + SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "true") { + testSchemaMerging(2) + } + + withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", + SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "false") { + testSchemaMerging(3) + } + } + + test("Enabling/disabling schema merging") { + def testSchemaMerging(expectedColumnNumber: Int): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) + assert(ctx.read.parquet(basePath).columns.length === expectedColumnNumber) + } + } + + withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true") { + testSchemaMerging(3) + } + + withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") { + testSchemaMerging(2) + } + } + + test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") { + withTempPath { dir => + val basePath = dir.getCanonicalPath + ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString) + + // Disables the global SQL option for schema merging + withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") { + assertResult(2) { + // Disables schema merging via data source option + ctx.read.option("mergeSchema", "false").parquet(basePath).columns.length + } + + assertResult(3) { + // Enables schema merging via data source option + ctx.read.option("mergeSchema", "true").parquet(basePath).columns.length + } + } + } + } + + test("SPARK-9119 Decimal should be correctly written into parquet") { + withTempPath { dir => + val basePath = dir.getCanonicalPath + val schema = StructType(Array(StructField("name", DecimalType(10, 5), false))) + val rowRDD = sqlContext.sparkContext.parallelize(Array(Row(Decimal("67123.45")))) + val df = sqlContext.createDataFrame(rowRDD, schema) + df.write.parquet(basePath) + + val decimal = sqlContext.read.parquet(basePath).first().getDecimal(0) + assert(Decimal("67123.45") === Decimal(decimal)) + } + } + + test("SPARK-10005 Schema merging for nested struct") { + val sqlContext = _sqlContext + import sqlContext.implicits._ + + withTempPath { dir => + val path = dir.getCanonicalPath + + def append(df: DataFrame): Unit = { + 
df.write.mode(SaveMode.Append).parquet(path) + } + + // Note that both the following two DataFrames contain a single struct column with multiple + // nested fields. + append((1 to 2).map(i => Tuple1((i, i))).toDF()) + append((1 to 2).map(i => Tuple1((i, i, i))).toDF()) + + withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") { + checkAnswer( + sqlContext.read.option("mergeSchema", "true").parquet(path), + Seq( + Row(Row(1, 1, null)), + Row(Row(2, 2, null)), + Row(Row(1, 1, 1)), + Row(Row(2, 2, 2)))) + } + } + } + + test("SPARK-10301 requested schema clipping - same schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L))) + } + } + + // This test case is ignored because of parquet-mr bug PARQUET-370 + ignore("SPARK-10301 requested schema clipping - schemas with disjoint sets of fields") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(null, null))) + } + } + + test("SPARK-10301 requested schema clipping - requested schema contains physical schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L, null, null))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'd', id + 3) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, null, null, 3L))) + } + } + + test("SPARK-10301 requested schema clipping - physical schema contains requested schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2, 'd', id + 3) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + val df = 
sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2, 'd', id + 3) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 3L))) + } + } + + test("SPARK-10301 requested schema clipping - schemas overlap but don't contain each other") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(1L, 2L, null))) + } + } + + test("SPARK-10301 requested schema clipping - deeply nested struct") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', ARRAY(NAMED_STRUCT('b', id, 'c', id))) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add( + "a", + ArrayType( + new StructType() + .add("b", LongType, nullable = true) + .add("d", StringType, nullable = true), + containsNull = true), + nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(Seq(Row(0, null))))) + } + } + + test("SPARK-10301 requested schema clipping - out of order") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df1 = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + val df2 = sqlContext + .range(1, 2) + .selectExpr("NAMED_STRUCT('c', id + 2, 'b', id + 1, 'd', id + 3) AS s") + .coalesce(1) + + df1.write.parquet(path) + df2.write.mode(SaveMode.Append).parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Seq( + Row(Row(0, 1, null)), + Row(Row(null, 2, 4)))) + } + } + + test("SPARK-10301 requested schema clipping - schema merging") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df1 = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'c', id + 2) AS s") + .coalesce(1) + + val df2 = sqlContext + .range(1, 2) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + df1.write.mode(SaveMode.Append).parquet(path) + df2.write.mode(SaveMode.Append).parquet(path) + + checkAnswer( + sqlContext + .read + .option("mergeSchema", "true") + .parquet(path) + .selectExpr("s.a", "s.b", "s.c"), + Seq( + Row(0, null, 2), + Row(1, 2, 3))) + } + } + + test("SPARK-10301 requested schema clipping - UDT") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df = sqlContext + .range(1) + .selectExpr( + """NAMED_STRUCT( + | 'f0', CAST(id AS STRING), + | 'f1', NAMED_STRUCT( + | 'a', CAST(id + 1 AS INT), + | 'b', CAST(id + 2 AS LONG), + | 'c', CAST(id + 3.5 AS DOUBLE) + | ) + |) AS s + """.stripMargin) + 
.coalesce(1) + + df.write.mode(SaveMode.Append).parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("f1", new NestedStructUDT, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(NestedStruct(1, 2L, 3.5D)))) + } + } +} + +object TestingUDT { + @SQLUserDefinedType(udt = classOf[NestedStructUDT]) + case class NestedStruct(a: Integer, b: Long, c: Double) + + class NestedStructUDT extends UserDefinedType[NestedStruct] { + override def sqlType: DataType = + new StructType() + .add("a", IntegerType, nullable = true) + .add("b", LongType, nullable = false) + .add("c", DoubleType, nullable = false) + + override def serialize(obj: Any): Any = { + val row = new SpecificMutableRow(sqlType.asInstanceOf[StructType].map(_.dataType)) + obj match { + case n: NestedStruct => + row.setInt(0, n.a) + row.setLong(1, n.b) + row.setDouble(2, n.c) + } + } + + override def userClass: Class[NestedStruct] = classOf[NestedStruct] + + override def deserialize(datum: Any): NestedStruct = { + datum match { + case row: InternalRow => + NestedStruct(row.getInt(0), row.getLong(1), row.getDouble(2)) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala similarity index 64% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 4a0b3b60f419..eb7192d40046 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -15,20 +15,18 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.schema.MessageTypeParser -import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { - val sqlContext = TestSQLContext +abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { /** * Checks whether the reflected Parquet message type for product type `T` conforms `messageType`. 
@@ -198,7 +196,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -267,7 +265,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); | optional group _2 (LIST) { | repeated group bag { - | optional group array_element { + | optional group array { | required int32 _1; | required double _2; | } @@ -585,6 +583,36 @@ class ParquetSchemaSuite extends ParquetSchemaTest { |} """.stripMargin) + testParquetToCatalyst( + "Backwards-compatibility: LIST with non-nullable element type 7 - " + + "parquet-protobuf primitive lists", + new StructType() + .add("f1", ArrayType(IntegerType, containsNull = false), nullable = false), + """message root { + | repeated int32 f1; + |} + """.stripMargin) + + testParquetToCatalyst( + "Backwards-compatibility: LIST with non-nullable element type 8 - " + + "parquet-protobuf non-primitive lists", + { + val elementType = + new StructType() + .add("c1", StringType, nullable = true) + .add("c2", IntegerType, nullable = false) + + new StructType() + .add("f1", ArrayType(elementType, containsNull = false), nullable = false) + }, + """message root { + | repeated group f1 { + | optional binary c1 (UTF8); + | required int32 c2; + | } + |} + """.stripMargin) + // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST // ======================================================= @@ -616,7 +644,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional group f1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -913,4 +941,531 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3)); |} """.stripMargin) + + private def testSchemaClipping( + testName: String, + parquetSchema: String, + catalystSchema: StructType, + expectedSchema: String): Unit = { + test(s"Clipping - $testName") { + val expected = MessageTypeParser.parseMessageType(expectedSchema) + val actual = CatalystReadSupport.clipParquetSchema( + MessageTypeParser.parseMessageType(parquetSchema), catalystSchema) + + try { + expected.checkContains(actual) + actual.checkContains(expected) + } catch { case cause: Throwable => + fail( + s"""Expected clipped schema: + |$expected + |Actual clipped schema: + |$actual + """.stripMargin, + cause) + } + } + } + + testSchemaClipping( + "simple nested struct", + + parquetSchema = + """message root { + | required group f0 { + | optional int32 f00; + | optional int32 f01; + | } + |} + """.stripMargin, + + catalystSchema = { + val f0Type = new StructType().add("f00", IntegerType, nullable = true) + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", IntegerType, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional int32 f00; + | } + | optional int32 f1; + |} + """.stripMargin) + + testSchemaClipping( + "parquet-protobuf style array", + + parquetSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | optional int32 f010; + | optional double f011; + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f00Type = ArrayType(StringType, containsNull = false) + val f01Type = ArrayType( + new StructType() + .add("f011", DoubleType, nullable = true), + containsNull = false) + + val 
f0Type = new StructType() + .add("f00", f00Type, nullable = false) + .add("f01", f01Type, nullable = false) + val f1Type = ArrayType(IntegerType, containsNull = true) + + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", f1Type, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | optional double f011; + | } + | } + | + | optional group f1 (LIST) { + | repeated group list { + | optional int32 element; + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-thrift style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-avro style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-hive style array", + + parquetSchema = + """message root { + | optional group f0 { + | optional group f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional int32 f010; + | optional double f011; + | } + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = true), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = true), nullable = true) + + new StructType().add("f0", f0Type, nullable = true) + }, + + expectedSchema = + """message root { + | optional group f0 { + | optional group 
f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional double f011; + | optional int64 f012; + | } + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "2-level list of required struct", + + parquetSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | required int32 f000; + | optional int64 f001; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f00ElementType = + new StructType() + .add("f001", LongType, nullable = true) + .add("f002", DoubleType, nullable = false) + + val f00Type = ArrayType(f00ElementType, containsNull = false) + val f0Type = new StructType().add("f00", f00Type, nullable = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | optional int64 f001; + | required double f002; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | + | optional group f01 (LIST) { + | repeated group list { + | required group element { + | optional int32 f010; + | optional double f011; + | } + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | + | optional group f01 (LIST) { + | repeated group list { + | required group element { + | optional double f011; + | optional int64 f012; + | } + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "empty requested schema", + + parquetSchema = + """message root { + | required group f0 { + | required int32 f00; + | required int64 f01; + | } + |} + """.stripMargin, + + catalystSchema = new StructType(), + + expectedSchema = "message root {}") + + testSchemaClipping( + "disjoint field sets", + + parquetSchema = + """message root { + | required group f0 { + | required int32 f00; + | required int64 f01; + | } + |} + """.stripMargin, + + catalystSchema = + new StructType() + .add( + "f0", + new StructType() + .add("f02", FloatType, nullable = true) + .add("f03", DoubleType, nullable = true), + nullable = true), + + expectedSchema = + """message root { + | required group f0 { + | optional float f02; + | optional double f03; + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-avro style map", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required int32 key; + | required group value { + | required int32 value_f0; + | required int64 value_f1; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val valueType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) 
+ + val f0Type = MapType(IntegerType, valueType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required int32 key; + | required group value { + | required int64 value_f1; + | required double value_f2; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard map", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required int32 key; + | required group value { + | required int32 value_f0; + | required int64 value_f1; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val valueType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) + + val f0Type = MapType(IntegerType, valueType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required int32 key; + | required group value { + | required int64 value_f1; + | required double value_f2; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard map with complex key", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required group key { + | required int32 value_f0; + | required int64 value_f1; + | } + | required int32 value; + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val keyType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) + + val f0Type = MapType(keyType, IntegerType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required group key { + | required int64 value_f1; + | required double value_f2; + | } + | required int32 value; + | } + | } + |} + """.stripMargin) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala similarity index 86% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 64e94056f209..5dbc7d1630f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -15,16 +15,15 @@ * limitations under the License. */ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.SparkFunSuite import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{DataFrame, SaveMode} +import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} /** * A helper trait that provides convenient facilities for Parquet testing. @@ -33,7 +32,9 @@ import org.apache.spark.sql.{DataFrame, SaveMode} * convenient to use tuples rather than special case classes when writing test cases/suites. * Especially, `Tuple1.apply` can be used to easily wrap a single type/value. 
*/ -private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => +private[sql] trait ParquetTest extends SQLTestUtils { + protected def _sqlContext: SQLContext + /** * Writes `data` to a Parquet file, which is then passed to `f` and will be deleted after `f` * returns. @@ -42,7 +43,7 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => (data: Seq[T]) (f: String => Unit): Unit = { withTempPath { file => - sqlContext.createDataFrame(data).write.parquet(file.getCanonicalPath) + _sqlContext.createDataFrame(data).write.parquet(file.getCanonicalPath) f(file.getCanonicalPath) } } @@ -54,7 +55,7 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => protected def withParquetDataFrame[T <: Product: ClassTag: TypeTag] (data: Seq[T]) (f: DataFrame => Unit): Unit = { - withParquetFile(data)(path => f(sqlContext.read.parquet(path))) + withParquetFile(data)(path => f(_sqlContext.read.parquet(path))) } /** @@ -66,14 +67,14 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => (data: Seq[T], tableName: String) (f: => Unit): Unit = { withParquetDataFrame(data) { df => - sqlContext.registerDataFrameAsTable(df, tableName) + _sqlContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( data: Seq[T], path: File): Unit = { - sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) + _sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala similarity index 93% rename from sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala index 1c532d78790d..b789c5a106e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala @@ -15,16 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.parquet +package org.apache.spark.sql.execution.datasources.parquet -import org.apache.spark.sql.test.TestSQLContext -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.SharedSQLContext -class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest { +class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with SharedSQLContext { import ParquetCompatibilityTest._ - override val sqlContext: SQLContext = TestSQLContext - private val parquetFilePath = Thread.currentThread().getContextClassLoader.getResource("parquet-thrift-compat.snappy.parquet") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 8ec3985e0036..22189477d277 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -18,15 +18,11 @@ package org.apache.spark.sql.execution.debug import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.TestData._ -import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.SharedSQLContext + +class DebuggingSuite extends SparkFunSuite with SharedSQLContext { -class DebuggingSuite extends SparkFunSuite { test("DataFrame.debug()") { testData.debug() } - - test("DataFrame.typeCheck()") { - testData.typeCheck() - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala new file mode 100644 index 000000000000..53a0e53fd771 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -0,0 +1,87 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.joins + +import scala.reflect.ClassTag + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.{AccumulatorSuite, SparkConf, SparkContext} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{SQLConf, SQLContext, QueryTest} + +/** + * Test various broadcast join operators with unsafe enabled. + * + * Tests in this suite we need to run Spark in local-cluster mode. In particular, the use of + * unsafe map in [[org.apache.spark.sql.execution.joins.UnsafeHashedRelation]] is not triggered + * without serializing the hashed relation, which does not happen in local mode. 
+ */ +class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll { + private var sc: SparkContext = null + private var sqlContext: SQLContext = null + + /** + * Create a new [[SQLContext]] running in local-cluster mode with unsafe and codegen enabled. + */ + override def beforeAll(): Unit = { + super.beforeAll() + val conf = new SparkConf() + .setMaster("local-cluster[2,1,1024]") + .setAppName("testing") + sc = new SparkContext(conf) + sqlContext = new SQLContext(sc) + sqlContext.setConf(SQLConf.UNSAFE_ENABLED, true) + sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true) + } + + override def afterAll(): Unit = { + sc.stop() + sc = null + sqlContext = null + } + + /** + * Test whether the specified broadcast join updates the peak execution memory accumulator. + */ + private def testBroadcastJoin[T: ClassTag](name: String, joinType: String): Unit = { + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, name) { + val df1 = sqlContext.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") + val df2 = sqlContext.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") + // Comparison at the end is for broadcast left semi join + val joinExpression = df1("key") === df2("key") && df1("value") > df2("value") + val df3 = df1.join(broadcast(df2), joinExpression, joinType) + val plan = df3.queryExecution.executedPlan + assert(plan.collect { case p: T => p }.size === 1) + plan.executeCollect() + } + } + + test("unsafe broadcast hash join updates peak execution memory") { + testBroadcastJoin[BroadcastHashJoin]("unsafe broadcast hash join", "inner") + } + + test("unsafe broadcast hash outer join updates peak execution memory") { + testBroadcastJoin[BroadcastHashOuterJoin]("unsafe broadcast hash outer join", "left_outer") + } + + test("unsafe broadcast left semi join updates peak execution memory") { + testBroadcastJoin[BroadcastLeftSemiJoinHash]("unsafe broadcast left semi join", "leftsemi") + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index 8b1a9b21a96b..4c9187a9a710 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -22,11 +22,13 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.collection.CompactBuffer -class HashedRelationSuite extends SparkFunSuite { +class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { // Key is simply the record itself private val keyProjection = new Projection { @@ -35,7 +37,8 @@ class HashedRelationSuite extends SparkFunSuite { test("GeneralHashedRelation") { val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2)) - val hashed = HashedRelation(data.iterator, keyProjection) + val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") + val hashed = HashedRelation(data.iterator, numDataRows, keyProjection) assert(hashed.isInstanceOf[GeneralHashedRelation]) assert(hashed.get(data(0)) === CompactBuffer[InternalRow](data(0))) @@ -45,11 +48,13 @@ 
class HashedRelationSuite extends SparkFunSuite { val data2 = CompactBuffer[InternalRow](data(2)) data2 += data(2) assert(hashed.get(data(2)) === data2) + assert(numDataRows.value.value === data.length) } test("UniqueKeyHashedRelation") { val data = Array(InternalRow(0), InternalRow(1), InternalRow(2)) - val hashed = HashedRelation(data.iterator, keyProjection) + val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") + val hashed = HashedRelation(data.iterator, numDataRows, keyProjection) assert(hashed.isInstanceOf[UniqueKeyHashedRelation]) assert(hashed.get(data(0)) === CompactBuffer[InternalRow](data(0))) @@ -62,17 +67,19 @@ class HashedRelationSuite extends SparkFunSuite { assert(uniqHashed.getValue(data(1)) === data(1)) assert(uniqHashed.getValue(data(2)) === data(2)) assert(uniqHashed.getValue(InternalRow(10)) === null) + assert(numDataRows.value.value === data.length) } test("UnsafeHashedRelation") { val schema = StructType(StructField("a", IntegerType, true) :: Nil) val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2)) + val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") val toUnsafe = UnsafeProjection.create(schema) val unsafeData = data.map(toUnsafe(_).copy()).toArray val buildKey = Seq(BoundReference(0, IntegerType, false)) val keyGenerator = UnsafeProjection.create(buildKey) - val hashed = UnsafeHashedRelation(unsafeData.iterator, keyGenerator, 1) + val hashed = UnsafeHashedRelation(unsafeData.iterator, numDataRows, keyGenerator, 1) assert(hashed.isInstanceOf[UnsafeHashedRelation]) assert(hashed.get(unsafeData(0)) === CompactBuffer[InternalRow](unsafeData(0))) @@ -94,5 +101,37 @@ class HashedRelationSuite extends SparkFunSuite { assert(hashed2.get(unsafeData(1)) === CompactBuffer[InternalRow](unsafeData(1))) assert(hashed2.get(toUnsafe(InternalRow(10))) === null) assert(hashed2.get(unsafeData(2)) === data2) + assert(numDataRows.value.value === data.length) + + val os2 = new ByteArrayOutputStream() + val out2 = new ObjectOutputStream(os2) + hashed2.asInstanceOf[UnsafeHashedRelation].writeExternal(out2) + out2.flush() + // This depends on that the order of items in BytesToBytesMap.iterator() is exactly the same + // as they are inserted + assert(java.util.Arrays.equals(os2.toByteArray, os.toByteArray)) + } + + test("test serialization empty hash map") { + val os = new ByteArrayOutputStream() + val out = new ObjectOutputStream(os) + val hashed = new UnsafeHashedRelation( + new java.util.HashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) + hashed.writeExternal(out) + out.flush() + val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray)) + val hashed2 = new UnsafeHashedRelation() + hashed2.readExternal(in) + + val schema = StructType(StructField("a", IntegerType, true) :: Nil) + val toUnsafe = UnsafeProjection.create(schema) + val row = toUnsafe(InternalRow(0)) + assert(hashed2.get(row) === null) + + val os2 = new ByteArrayOutputStream() + val out2 = new ObjectOutputStream(os2) + hashed2.writeExternal(out2) + out2.flush() + assert(java.util.Arrays.equals(os2.toByteArray, os.toByteArray)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala new file mode 100644 index 000000000000..cc649b9bd4c4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.joins + +import org.apache.spark.sql.{DataFrame, execution, Row, SQLConf} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys +import org.apache.spark.sql.catalyst.plans.Inner +import org.apache.spark.sql.catalyst.plans.logical.Join +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{IntegerType, StringType, StructType} + +class InnerJoinSuite extends SparkPlanTest with SharedSQLContext { + + private lazy val myUpperCaseData = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(1, "A"), + Row(2, "B"), + Row(3, "C"), + Row(4, "D"), + Row(5, "E"), + Row(6, "F"), + Row(null, "G") + )), new StructType().add("N", IntegerType).add("L", StringType)) + + private lazy val myLowerCaseData = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(1, "a"), + Row(2, "b"), + Row(3, "c"), + Row(4, "d"), + Row(null, "e") + )), new StructType().add("n", IntegerType).add("l", StringType)) + + private lazy val myTestData = Seq( + (1, 1), + (1, 2), + (2, 1), + (2, 2), + (3, 1), + (3, 2) + ).toDF("a", "b") + + // Note: the input dataframes and expression must be evaluated lazily because + // the SQLContext should be used only within a test to keep SQL tests stable + private def testInnerJoin( + testName: String, + leftRows: => DataFrame, + rightRows: => DataFrame, + condition: () => Expression, + expectedAnswer: Seq[Product]): Unit = { + + def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { + val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition())) + ExtractEquiJoinKeys.unapply(join) + } + + def makeBroadcastHashJoin( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + boundCondition: Option[Expression], + leftPlan: SparkPlan, + rightPlan: SparkPlan, + side: BuildSide) = { + val broadcastHashJoin = + execution.joins.BroadcastHashJoin(leftKeys, rightKeys, side, leftPlan, rightPlan) + boundCondition.map(Filter(_, broadcastHashJoin)).getOrElse(broadcastHashJoin) + } + + def makeShuffledHashJoin( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + boundCondition: Option[Expression], + leftPlan: SparkPlan, + rightPlan: SparkPlan, + side: BuildSide) = { + val shuffledHashJoin = + execution.joins.ShuffledHashJoin(leftKeys, rightKeys, side, leftPlan, rightPlan) + val filteredJoin = + boundCondition.map(Filter(_, shuffledHashJoin)).getOrElse(shuffledHashJoin) + EnsureRequirements(sqlContext).apply(filteredJoin) + } + + def makeSortMergeJoin( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + boundCondition: Option[Expression], + leftPlan: SparkPlan, + rightPlan: SparkPlan) = { + val sortMergeJoin = + 
execution.joins.SortMergeJoin(leftKeys, rightKeys, leftPlan, rightPlan) + val filteredJoin = boundCondition.map(Filter(_, sortMergeJoin)).getOrElse(sortMergeJoin) + EnsureRequirements(sqlContext).apply(filteredJoin) + } + + test(s"$testName using BroadcastHashJoin (build=left)") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) => + makeBroadcastHashJoin( + leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildLeft), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using BroadcastHashJoin (build=right)") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) => + makeBroadcastHashJoin( + leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildRight), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using ShuffledHashJoin (build=left)") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) => + makeShuffledHashJoin( + leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildLeft), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using ShuffledHashJoin (build=right)") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) => + makeShuffledHashJoin( + leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildRight), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using SortMergeJoin") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) => + makeSortMergeJoin(leftKeys, rightKeys, boundCondition, leftPlan, rightPlan), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + } + + testInnerJoin( + "inner join, one match per row", + myUpperCaseData, + myLowerCaseData, + () => (myUpperCaseData.col("N") === myLowerCaseData.col("n")).expr, + Seq( + (1, "A", 1, "a"), + (2, "B", 2, "b"), + (3, "C", 3, "c"), + (4, "D", 4, "d") + ) + ) + + { + lazy val left = myTestData.where("a = 1") + lazy val right = myTestData.where("a = 1") + testInnerJoin( + "inner join, multiple matches", + left, + right, + () => (left.col("a") === right.col("a")).expr, + Seq( + (1, 1, 1, 1), + (1, 1, 1, 2), + (1, 2, 1, 1), + (1, 2, 1, 2) + ) + ) + } + + { + lazy val left = myTestData.where("a = 1") + lazy val right = myTestData.where("a = 2") + testInnerJoin( + "inner join, no matches", + left, + right, + () => (left.col("a") === right.col("a")).expr, + Seq.empty + ) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala index 2c27da596bc4..a1a617d7b739 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala @@ -1,89 +1,214 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.joins - -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Expression, LessThan} -import org.apache.spark.sql.catalyst.plans.{FullOuter, LeftOuter, RightOuter} -import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest} - -class OuterJoinSuite extends SparkPlanTest { - - val left = Seq( - (1, 2.0), - (2, 1.0), - (3, 3.0) - ).toDF("a", "b") - - val right = Seq( - (2, 3.0), - (3, 2.0), - (4, 1.0) - ).toDF("c", "d") - - val leftKeys: List[Expression] = 'a :: Nil - val rightKeys: List[Expression] = 'c :: Nil - val condition = Some(LessThan('b, 'd)) - - test("shuffled hash outer join") { - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - ShuffledHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right), - Seq( - (1, 2.0, null, null), - (2, 1.0, 2, 3.0), - (3, 3.0, null, null) - ).map(Row.fromTuple)) - - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - ShuffledHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right), - Seq( - (2, 1.0, 2, 3.0), - (null, null, 3, 2.0), - (null, null, 4, 1.0) - ).map(Row.fromTuple)) - - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - ShuffledHashOuterJoin(leftKeys, rightKeys, FullOuter, condition, left, right), - Seq( - (1, 2.0, null, null), - (2, 1.0, 2, 3.0), - (3, 3.0, null, null), - (null, null, 3, 2.0), - (null, null, 4, 1.0) - ).map(Row.fromTuple)) - } - - test("broadcast hash outer join") { - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - BroadcastHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right), - Seq( - (1, 2.0, null, null), - (2, 1.0, 2, 3.0), - (3, 3.0, null, null) - ).map(Row.fromTuple)) - - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - BroadcastHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right), - Seq( - (2, 1.0, 2, 3.0), - (null, null, 3, 2.0), - (null, null, 4, 1.0) - ).map(Row.fromTuple)) - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.joins + +import org.apache.spark.sql.{DataFrame, Row, SQLConf} +import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical.Join +import org.apache.spark.sql.catalyst.expressions.{And, Expression, LessThan} +import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{IntegerType, DoubleType, StructType} + +class OuterJoinSuite extends SparkPlanTest with SharedSQLContext { + + private lazy val left = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(1, 2.0), + Row(2, 100.0), + Row(2, 1.0), // This row is duplicated to ensure that we will have multiple buffered matches + Row(2, 1.0), + Row(3, 3.0), + Row(5, 1.0), + Row(6, 6.0), + Row(null, null) + )), new StructType().add("a", IntegerType).add("b", DoubleType)) + + private lazy val right = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(0, 0.0), + Row(2, 3.0), // This row is duplicated to ensure that we will have multiple buffered matches + Row(2, -1.0), + Row(2, -1.0), + Row(2, 3.0), + Row(3, 2.0), + Row(4, 1.0), + Row(5, 3.0), + Row(7, 7.0), + Row(null, null) + )), new StructType().add("c", IntegerType).add("d", DoubleType)) + + private lazy val condition = { + And((left.col("a") === right.col("c")).expr, + LessThan(left.col("b").expr, right.col("d").expr)) + } + + // Note: the input dataframes and expression must be evaluated lazily because + // the SQLContext should be used only within a test to keep SQL tests stable + private def testOuterJoin( + testName: String, + leftRows: => DataFrame, + rightRows: => DataFrame, + joinType: JoinType, + condition: => Expression, + expectedAnswer: Seq[Product]): Unit = { + + def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { + val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) + ExtractEquiJoinKeys.unapply(join) + } + + test(s"$testName using ShuffledHashOuterJoin") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + EnsureRequirements(sqlContext).apply( + ShuffledHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + if (joinType != FullOuter) { + test(s"$testName using BroadcastHashOuterJoin") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + BroadcastHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using SortMergeOuterJoin") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) 
=> + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + EnsureRequirements(sqlContext).apply( + SortMergeOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = false) + } + } + } + } + } + + // --- Basic outer joins ------------------------------------------------------------------------ + + testOuterJoin( + "basic left outer join", + left, + right, + LeftOuter, + condition, + Seq( + (null, null, null, null), + (1, 2.0, null, null), + (2, 100.0, null, null), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (3, 3.0, null, null), + (5, 1.0, 5, 3.0), + (6, 6.0, null, null) + ) + ) + + testOuterJoin( + "basic right outer join", + left, + right, + RightOuter, + condition, + Seq( + (null, null, null, null), + (null, null, 0, 0.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (null, null, 2, -1.0), + (null, null, 2, -1.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (null, null, 3, 2.0), + (null, null, 4, 1.0), + (5, 1.0, 5, 3.0), + (null, null, 7, 7.0) + ) + ) + + testOuterJoin( + "basic full outer join", + left, + right, + FullOuter, + condition, + Seq( + (1, 2.0, null, null), + (null, null, 2, -1.0), + (null, null, 2, -1.0), + (2, 100.0, null, null), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (2, 1.0, 2, 3.0), + (3, 3.0, null, null), + (5, 1.0, 5, 3.0), + (6, 6.0, null, null), + (null, null, 0, 0.0), + (null, null, 3, 2.0), + (null, null, 4, 1.0), + (null, null, 7, 7.0), + (null, null, null, null), + (null, null, null, null) + ) + ) + + // --- Both inputs empty ------------------------------------------------------------------------ + + testOuterJoin( + "left outer join with both inputs empty", + left.filter("false"), + right.filter("false"), + LeftOuter, + condition, + Seq.empty + ) + + testOuterJoin( + "right outer join with both inputs empty", + left.filter("false"), + right.filter("false"), + RightOuter, + condition, + Seq.empty + ) + + testOuterJoin( + "full outer join with both inputs empty", + left.filter("false"), + right.filter("false"), + FullOuter, + condition, + Seq.empty + ) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala index 927e85a7db3d..baa86e320d98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala @@ -17,58 +17,100 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.{LessThan, Expression} -import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest} +import org.apache.spark.sql.{SQLConf, DataFrame, Row} +import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys +import org.apache.spark.sql.catalyst.plans.Inner +import org.apache.spark.sql.catalyst.plans.logical.Join +import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression} +import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} +class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { -class SemiJoinSuite extends SparkPlanTest{ - val left = Seq( - (1, 
2.0), - (1, 2.0), - (2, 1.0), - (2, 1.0), - (3, 3.0) - ).toDF("a", "b") + private lazy val left = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(1, 2.0), + Row(1, 2.0), + Row(2, 1.0), + Row(2, 1.0), + Row(3, 3.0), + Row(null, null), + Row(null, 5.0), + Row(6, null) + )), new StructType().add("a", IntegerType).add("b", DoubleType)) - val right = Seq( - (2, 3.0), - (2, 3.0), - (3, 2.0), - (4, 1.0) - ).toDF("c", "d") + private lazy val right = ctx.createDataFrame( + ctx.sparkContext.parallelize(Seq( + Row(2, 3.0), + Row(2, 3.0), + Row(3, 2.0), + Row(4, 1.0), + Row(null, null), + Row(null, 5.0), + Row(6, null) + )), new StructType().add("c", IntegerType).add("d", DoubleType)) - val leftKeys: List[Expression] = 'a :: Nil - val rightKeys: List[Expression] = 'c :: Nil - val condition = Some(LessThan('b, 'd)) - - test("left semi join hash") { - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - LeftSemiJoinHash(leftKeys, rightKeys, left, right, condition), - Seq( - (2, 1.0), - (2, 1.0) - ).map(Row.fromTuple)) + private lazy val condition = { + And((left.col("a") === right.col("c")).expr, + LessThan(left.col("b").expr, right.col("d").expr)) } - test("left semi join BNL") { - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - LeftSemiJoinBNL(left, right, condition), - Seq( - (1, 2.0), - (1, 2.0), - (2, 1.0), - (2, 1.0) - ).map(Row.fromTuple)) - } + // Note: the input dataframes and expression must be evaluated lazily because + // the SQLContext should be used only within a test to keep SQL tests stable + private def testLeftSemiJoin( + testName: String, + leftRows: => DataFrame, + rightRows: => DataFrame, + condition: => Expression, + expectedAnswer: Seq[Product]): Unit = { + + def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { + val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) + ExtractEquiJoinKeys.unapply(join) + } - test("broadcast left semi join hash") { - checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) => - BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, condition), - Seq( - (2, 1.0), - (2, 1.0) - ).map(Row.fromTuple)) + test(s"$testName using LeftSemiJoinHash") { + extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + EnsureRequirements(left.sqlContext).apply( + LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using BroadcastLeftSemiJoinHash") { + extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } + } + + test(s"$testName using LeftSemiJoinBNL") { + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + LeftSemiJoinBNL(left, right, Some(condition)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } + } } + + testLeftSemiJoin( + "basic test", + left, + right, + condition, + Seq( + (2, 1.0), + (2, 1.0) + ) + ) } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala new file mode 100644 index 000000000000..80006bf077fe --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -0,0 +1,577 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.metric + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} + +import scala.collection.mutable + +import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm._ +import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ui.SparkPlanGraph +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.util.Utils + + +class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ + + test("LongSQLMetric should not box Long") { + val l = SQLMetrics.createLongMetric(ctx.sparkContext, "long") + val f = () => { + l += 1L + l.add(1L) + } + BoxingFinder.getClassReader(f.getClass).foreach { cl => + val boxingFinder = new BoxingFinder() + cl.accept(boxingFinder, 0) + assert(boxingFinder.boxingInvokes.isEmpty, s"Found boxing: ${boxingFinder.boxingInvokes}") + } + } + + test("Normal accumulator should do boxing") { + // We need this test to make sure BoxingFinder works. + val l = ctx.sparkContext.accumulator(0L) + val f = () => { l += 1L } + BoxingFinder.getClassReader(f.getClass).foreach { cl => + val boxingFinder = new BoxingFinder() + cl.accept(boxingFinder, 0) + assert(boxingFinder.boxingInvokes.nonEmpty, "Failed to find boxing in this test") + } + } + + /** + * Call `df.collect()` and verify that the collected metrics are the same as "expectedMetrics". + * + * @param df `DataFrame` to run + * @param expectedNumOfJobs number of jobs that will run + * @param expectedMetrics the expected metrics. The format is + * `nodeId -> (operatorName, metric name -> metric value)`. 
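+ * + * A minimal sketch of a call (the node id, operator name, and metric value below are illustrative only, not values asserted by this suite): + * {{{ + * testSparkPlanMetrics(df, expectedNumOfJobs = 1, + * Map(0L -> ("Filter", Map("number of output rows" -> 1L)))) + * }}}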
+ */ + private def testSparkPlanMetrics( + df: DataFrame, + expectedNumOfJobs: Int, + expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = { + val previousExecutionIds = ctx.listener.executionIdToData.keySet + df.collect() + ctx.sparkContext.listenerBus.waitUntilEmpty(10000) + val executionIds = ctx.listener.executionIdToData.keySet.diff(previousExecutionIds) + assert(executionIds.size === 1) + val executionId = executionIds.head + val jobs = ctx.listener.getExecution(executionId).get.jobs + // Use "<=" because there is a race condition that we may miss some jobs + // TODO Change it to "=" once we fix the race condition that missing the JobStarted event. + assert(jobs.size <= expectedNumOfJobs) + if (jobs.size == expectedNumOfJobs) { + // If we can track all jobs, check the metric values + val metricValues = ctx.listener.getExecutionMetrics(executionId) + val actualMetrics = SparkPlanGraph(df.queryExecution.executedPlan).nodes.filter { node => + expectedMetrics.contains(node.id) + }.map { node => + val nodeMetrics = node.metrics.map { metric => + val metricValue = metricValues(metric.accumulatorId) + (metric.name, metricValue) + }.toMap + (node.id, node.name -> nodeMetrics) + }.toMap + assert(expectedMetrics === actualMetrics) + } else { + // TODO Remove this "else" once we fix the race condition that missing the JobStarted event. + // Since we cannot track all jobs, the metric values could be wrong and we should not check + // them. + logWarning("Due to a race condition, we miss some jobs and cannot verify the metric values") + } + } + + test("Project metrics") { + withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "false", + SQLConf.CODEGEN_ENABLED.key -> "false", + SQLConf.TUNGSTEN_ENABLED.key -> "false") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> Project(nodeId = 0) + val df = person.select('name) + testSparkPlanMetrics(df, 1, Map( + 0L ->("Project", Map( + "number of rows" -> 2L))) + ) + } + } + + test("TungstenProject metrics") { + withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "true", + SQLConf.CODEGEN_ENABLED.key -> "true", + SQLConf.TUNGSTEN_ENABLED.key -> "true") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = person.select('name) + testSparkPlanMetrics(df, 1, Map( + 0L ->("TungstenProject", Map( + "number of rows" -> 2L))) + ) + } + } + + test("Filter metrics") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> Filter(nodeId = 0) + val df = person.filter('age < 25) + testSparkPlanMetrics(df, 1, Map( + 0L -> ("Filter", Map( + "number of input rows" -> 2L, + "number of output rows" -> 1L))) + ) + } + + test("Aggregate metrics") { + withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "false", + SQLConf.CODEGEN_ENABLED.key -> "false", + SQLConf.TUNGSTEN_ENABLED.key -> "false") { + // Assume the execution plan is + // ... 
-> Aggregate(nodeId = 2) -> TungstenExchange(nodeId = 1) -> Aggregate(nodeId = 0) + val df = testData2.groupBy().count() // 2 partitions + testSparkPlanMetrics(df, 1, Map( + 2L -> ("Aggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 2L)), + 0L -> ("Aggregate", Map( + "number of input rows" -> 2L, + "number of output rows" -> 1L))) + ) + + // 2 partitions and each partition contains 2 keys + val df2 = testData2.groupBy('a).count() + testSparkPlanMetrics(df2, 1, Map( + 2L -> ("Aggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 4L)), + 0L -> ("Aggregate", Map( + "number of input rows" -> 4L, + "number of output rows" -> 3L))) + ) + } + } + + test("SortBasedAggregate metrics") { + // Because SortBasedAggregate may skip different rows if the number of partitions is different, + // this test should use the deterministic number of partitions. + withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "false", + SQLConf.CODEGEN_ENABLED.key -> "true", + SQLConf.TUNGSTEN_ENABLED.key -> "true") { + // Assume the execution plan is + // ... -> SortBasedAggregate(nodeId = 2) -> TungstenExchange(nodeId = 1) -> + // SortBasedAggregate(nodeId = 0) + val df = testData2.groupBy().count() // 2 partitions + testSparkPlanMetrics(df, 1, Map( + 2L -> ("SortBasedAggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 2L)), + 0L -> ("SortBasedAggregate", Map( + "number of input rows" -> 2L, + "number of output rows" -> 1L))) + ) + + // Assume the execution plan is + // ... -> SortBasedAggregate(nodeId = 3) -> TungstenExchange(nodeId = 2) + // -> ExternalSort(nodeId = 1)-> SortBasedAggregate(nodeId = 0) + // 2 partitions and each partition contains 2 keys + val df2 = testData2.groupBy('a).count() + testSparkPlanMetrics(df2, 1, Map( + 3L -> ("SortBasedAggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 4L)), + 0L -> ("SortBasedAggregate", Map( + "number of input rows" -> 4L, + "number of output rows" -> 3L))) + ) + } + } + + test("TungstenAggregate metrics") { + withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "true", + SQLConf.CODEGEN_ENABLED.key -> "true", + SQLConf.TUNGSTEN_ENABLED.key -> "true") { + // Assume the execution plan is + // ... -> TungstenAggregate(nodeId = 2) -> Exchange(nodeId = 1) + // -> TungstenAggregate(nodeId = 0) + val df = testData2.groupBy().count() // 2 partitions + testSparkPlanMetrics(df, 1, Map( + 2L -> ("TungstenAggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 2L)), + 0L -> ("TungstenAggregate", Map( + "number of input rows" -> 2L, + "number of output rows" -> 1L))) + ) + + // 2 partitions and each partition contains 2 keys + val df2 = testData2.groupBy('a).count() + testSparkPlanMetrics(df2, 1, Map( + 2L -> ("TungstenAggregate", Map( + "number of input rows" -> 6L, + "number of output rows" -> 4L)), + 0L -> ("TungstenAggregate", Map( + "number of input rows" -> 4L, + "number of output rows" -> 3L))) + ) + } + } + + test("SortMergeJoin metrics") { + // Because SortMergeJoin may skip different rows if the number of partitions is different, this + // test should use the deterministic number of partitions. + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { + val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) + testDataForJoin.registerTempTable("testDataForJoin") + withTempTable("testDataForJoin") { + // Assume the execution plan is + // ... 
-> SortMergeJoin(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = sqlContext.sql( + "SELECT * FROM testData2 JOIN testDataForJoin ON testData2.a = testDataForJoin.a") + testSparkPlanMetrics(df, 1, Map( + 1L -> ("SortMergeJoin", Map( + // It's 4 because we only read 3 rows in the first partition and 1 row in the second one + "number of left rows" -> 4L, + "number of right rows" -> 2L, + "number of output rows" -> 4L))) + ) + } + } + } + + test("SortMergeOuterJoin metrics") { + // Because SortMergeOuterJoin may skip different rows if the number of partitions is different, + // this test should use the deterministic number of partitions. + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { + val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) + testDataForJoin.registerTempTable("testDataForJoin") + withTempTable("testDataForJoin") { + // Assume the execution plan is + // ... -> SortMergeOuterJoin(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = sqlContext.sql( + "SELECT * FROM testData2 left JOIN testDataForJoin ON testData2.a = testDataForJoin.a") + testSparkPlanMetrics(df, 1, Map( + 1L -> ("SortMergeOuterJoin", Map( + // It's 4 because we only read 3 rows in the first partition and 1 row in the second one + "number of left rows" -> 6L, + "number of right rows" -> 2L, + "number of output rows" -> 8L))) + ) + + val df2 = sqlContext.sql( + "SELECT * FROM testDataForJoin right JOIN testData2 ON testData2.a = testDataForJoin.a") + testSparkPlanMetrics(df2, 1, Map( + 1L -> ("SortMergeOuterJoin", Map( + // It's 4 because we only read 3 rows in the first partition and 1 row in the second one + "number of left rows" -> 2L, + "number of right rows" -> 6L, + "number of output rows" -> 8L))) + ) + } + } + } + + test("BroadcastHashJoin metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + val df1 = Seq((1, "1"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2"), (3, "3"), (4, "4")).toDF("key", "value") + // Assume the execution plan is + // ... -> BroadcastHashJoin(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = df1.join(broadcast(df2), "key") + testSparkPlanMetrics(df, 2, Map( + 1L -> ("BroadcastHashJoin", Map( + "number of left rows" -> 2L, + "number of right rows" -> 4L, + "number of output rows" -> 2L))) + ) + } + } + + test("ShuffledHashJoin metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) + testDataForJoin.registerTempTable("testDataForJoin") + withTempTable("testDataForJoin") { + // Assume the execution plan is + // ... -> ShuffledHashJoin(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = sqlContext.sql( + "SELECT * FROM testData2 JOIN testDataForJoin ON testData2.a = testDataForJoin.a") + testSparkPlanMetrics(df, 1, Map( + 1L -> ("ShuffledHashJoin", Map( + "number of left rows" -> 6L, + "number of right rows" -> 2L, + "number of output rows" -> 4L))) + ) + } + } + } + + test("ShuffledHashOuterJoin metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") { + val df1 = Seq((1, "a"), (1, "b"), (4, "c")).toDF("key", "value") + val df2 = Seq((1, "a"), (1, "b"), (2, "c"), (3, "d")).toDF("key2", "value") + // Assume the execution plan is + // ... 
-> ShuffledHashOuterJoin(nodeId = 0) + val df = df1.join(df2, $"key" === $"key2", "left_outer") + testSparkPlanMetrics(df, 1, Map( + 0L -> ("ShuffledHashOuterJoin", Map( + "number of left rows" -> 3L, + "number of right rows" -> 4L, + "number of output rows" -> 5L))) + ) + + val df3 = df1.join(df2, $"key" === $"key2", "right_outer") + testSparkPlanMetrics(df3, 1, Map( + 0L -> ("ShuffledHashOuterJoin", Map( + "number of left rows" -> 3L, + "number of right rows" -> 4L, + "number of output rows" -> 6L))) + ) + + val df4 = df1.join(df2, $"key" === $"key2", "outer") + testSparkPlanMetrics(df4, 1, Map( + 0L -> ("ShuffledHashOuterJoin", Map( + "number of left rows" -> 3L, + "number of right rows" -> 4L, + "number of output rows" -> 7L))) + ) + } + } + + test("BroadcastHashOuterJoin metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + val df1 = Seq((1, "a"), (1, "b"), (4, "c")).toDF("key", "value") + val df2 = Seq((1, "a"), (1, "b"), (2, "c"), (3, "d")).toDF("key2", "value") + // Assume the execution plan is + // ... -> BroadcastHashOuterJoin(nodeId = 0) + val df = df1.join(broadcast(df2), $"key" === $"key2", "left_outer") + testSparkPlanMetrics(df, 2, Map( + 0L -> ("BroadcastHashOuterJoin", Map( + "number of left rows" -> 3L, + "number of right rows" -> 4L, + "number of output rows" -> 5L))) + ) + + val df3 = df1.join(broadcast(df2), $"key" === $"key2", "right_outer") + testSparkPlanMetrics(df3, 2, Map( + 0L -> ("BroadcastHashOuterJoin", Map( + "number of left rows" -> 3L, + "number of right rows" -> 4L, + "number of output rows" -> 6L))) + ) + } + } + + test("BroadcastNestedLoopJoin metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { + val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) + testDataForJoin.registerTempTable("testDataForJoin") + withTempTable("testDataForJoin") { + // Assume the execution plan is + // ... -> BroadcastNestedLoopJoin(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = sqlContext.sql( + "SELECT * FROM testData2 left JOIN testDataForJoin ON " + + "testData2.a * testDataForJoin.a != testData2.a + testDataForJoin.a") + testSparkPlanMetrics(df, 3, Map( + 1L -> ("BroadcastNestedLoopJoin", Map( + "number of left rows" -> 12L, // left needs to be scanned twice + "number of right rows" -> 2L, + "number of output rows" -> 12L))) + ) + } + } + } + + test("BroadcastLeftSemiJoinHash metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + val df1 = Seq((1, "1"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2"), (3, "3"), (4, "4")).toDF("key2", "value") + // Assume the execution plan is + // ... -> BroadcastLeftSemiJoinHash(nodeId = 0) + val df = df1.join(broadcast(df2), $"key" === $"key2", "leftsemi") + testSparkPlanMetrics(df, 2, Map( + 0L -> ("BroadcastLeftSemiJoinHash", Map( + "number of left rows" -> 2L, + "number of right rows" -> 4L, + "number of output rows" -> 2L))) + ) + } + } + + test("LeftSemiJoinHash metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") { + val df1 = Seq((1, "1"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2"), (3, "3"), (4, "4")).toDF("key2", "value") + // Assume the execution plan is + // ... 
-> LeftSemiJoinHash(nodeId = 0) + val df = df1.join(df2, $"key" === $"key2", "leftsemi") + testSparkPlanMetrics(df, 1, Map( + 0L -> ("LeftSemiJoinHash", Map( + "number of left rows" -> 2L, + "number of right rows" -> 4L, + "number of output rows" -> 2L))) + ) + } + } + + test("LeftSemiJoinBNL metrics") { + withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "false") { + val df1 = Seq((1, "1"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2"), (3, "3"), (4, "4")).toDF("key2", "value") + // Assume the execution plan is + // ... -> LeftSemiJoinBNL(nodeId = 0) + val df = df1.join(df2, $"key" < $"key2", "leftsemi") + testSparkPlanMetrics(df, 2, Map( + 0L -> ("LeftSemiJoinBNL", Map( + "number of left rows" -> 2L, + "number of right rows" -> 4L, + "number of output rows" -> 2L))) + ) + } + } + + test("CartesianProduct metrics") { + val testDataForJoin = testData2.filter('a < 2) // TestData2(1, 1) :: TestData2(1, 2) + testDataForJoin.registerTempTable("testDataForJoin") + withTempTable("testDataForJoin") { + // Assume the execution plan is + // ... -> CartesianProduct(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = sqlContext.sql( + "SELECT * FROM testData2 JOIN testDataForJoin") + testSparkPlanMetrics(df, 1, Map( + 1L -> ("CartesianProduct", Map( + "number of left rows" -> 12L, // left needs to be scanned twice + "number of right rows" -> 12L, // right is read 6 times + "number of output rows" -> 12L))) + ) + } + } + + test("save metrics") { + withTempPath { file => + val previousExecutionIds = ctx.listener.executionIdToData.keySet + // Assume the execution plan is + // PhysicalRDD(nodeId = 0) + person.select('name).write.format("json").save(file.getAbsolutePath) + ctx.sparkContext.listenerBus.waitUntilEmpty(10000) + val executionIds = ctx.listener.executionIdToData.keySet.diff(previousExecutionIds) + assert(executionIds.size === 1) + val executionId = executionIds.head + val jobs = ctx.listener.getExecution(executionId).get.jobs + // Use "<=" because there is a race condition that we may miss some jobs + // TODO Change "<=" to "=" once we fix the race condition that missing the JobStarted event. + assert(jobs.size <= 1) + val metricValues = ctx.listener.getExecutionMetrics(executionId) + // Because "save" will create a new DataFrame internally, we cannot get the real metric id. + // However, we still can check the value. + assert(metricValues.values.toSeq === Seq(2L)) + } + } + +} + +private case class MethodIdentifier[T](cls: Class[T], name: String, desc: String) + +/** + * If `method` is null, search all methods of this class recursively to find if they do some boxing. + * If `method` is specified, only search this method of the class to speed up the searching. + * + * This method will skip the methods in `visitedMethods` to avoid potential infinite cycles. + */ +private class BoxingFinder( + method: MethodIdentifier[_] = null, + val boxingInvokes: mutable.Set[String] = mutable.Set.empty, + visitedMethods: mutable.Set[MethodIdentifier[_]] = mutable.Set.empty) + extends ClassVisitor(ASM4) { + + private val primitiveBoxingClassName = + Set("java/lang/Long", + "java/lang/Double", + "java/lang/Integer", + "java/lang/Float", + "java/lang/Short", + "java/lang/Character", + "java/lang/Byte", + "java/lang/Boolean") + + override def visitMethod( + access: Int, name: String, desc: String, sig: String, exceptions: Array[String]): + MethodVisitor = { + if (method != null && (method.name != name || method.desc != desc)) { + // If method is specified, skip other methods. 
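+ // Returning an empty visitor means the bytecode of every non-matching method is ignored.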
+ return new MethodVisitor(ASM4) {} + } + + new MethodVisitor(ASM4) { + override def visitMethodInsn(op: Int, owner: String, name: String, desc: String) { + if (op == INVOKESPECIAL && name == "<init>" || op == INVOKESTATIC && name == "valueOf") { + if (primitiveBoxingClassName.contains(owner)) { + // Find boxing methods, e.g., new java.lang.Long(l) or java.lang.Long.valueOf(l) + boxingInvokes.add(s"$owner.$name") + } + } else { + // scalastyle:off classforname + val classOfMethodOwner = Class.forName(owner.replace('/', '.'), false, + Thread.currentThread.getContextClassLoader) + // scalastyle:on classforname + val m = MethodIdentifier(classOfMethodOwner, name, desc) + if (!visitedMethods.contains(m)) { + // Keep track of visited methods to avoid potential infinite cycles + visitedMethods += m + BoxingFinder.getClassReader(classOfMethodOwner).foreach { cl => + visitedMethods += m + cl.accept(new BoxingFinder(m, boxingInvokes, visitedMethods), 0) + } + } + } + } + } + } + } +} + +private object BoxingFinder { + + def getClassReader(cls: Class[_]): Option[ClassReader] = { + val className = cls.getName.replaceFirst("^.*\\.", "") + ".class" + val resourceStream = cls.getResourceAsStream(className) + val baos = new ByteArrayOutputStream(128) + // Copy data over, before delegating to ClassReader - + // else we can run out of open file handles. + Utils.copyStream(resourceStream, baos, true) + // ASM4 doesn't support Java 8 classes, which requires ASM5. + // So if the class is ASM5 (e.g., java.lang.Long when using JDK8 runtime to run these codes), + // then ClassReader will throw IllegalArgumentException. + // However, since this is only for testing, it's safe to skip these classes. + try { + Some(new ClassReader(new ByteArrayInputStream(baos.toByteArray))) + } catch { + case _: IllegalArgumentException => None + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala new file mode 100644 index 000000000000..80d1e8895694 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.ui + +import java.util.Properties + +import org.apache.spark.{SparkException, SparkContext, SparkConf, SparkFunSuite} +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.sql.execution.metric.LongSQLMetricValue +import org.apache.spark.scheduler._ +import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.test.SharedSQLContext + +class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { + import testImplicits._ + + private def createTestDataFrame: DataFrame = { + Seq( + (1, 1), + (2, 2) + ).toDF().filter("_1 > 1") + } + + private def createProperties(executionId: Long): Properties = { + val properties = new Properties() + properties.setProperty(SQLExecution.EXECUTION_ID_KEY, executionId.toString) + properties + } + + private def createStageInfo(stageId: Int, attemptId: Int): StageInfo = new StageInfo( + stageId = stageId, + attemptId = attemptId, + // The following fields are not used in tests + name = "", + numTasks = 0, + rddInfos = Nil, + parentIds = Nil, + details = "" + ) + + private def createTaskInfo(taskId: Int, attempt: Int): TaskInfo = new TaskInfo( + taskId = taskId, + attempt = attempt, + // The following fields are not used in tests + index = 0, + launchTime = 0, + executorId = "", + host = "", + taskLocality = null, + speculative = false + ) + + private def createTaskMetrics(accumulatorUpdates: Map[Long, Long]): TaskMetrics = { + val metrics = new TaskMetrics + metrics.setAccumulatorsUpdater(() => accumulatorUpdates.mapValues(new LongSQLMetricValue(_))) + metrics.updateAccumulators() + metrics + } + + test("basic") { + val listener = new SQLListener(ctx) + val executionId = 0 + val df = createTestDataFrame + val accumulatorIds = + SparkPlanGraph(df.queryExecution.executedPlan).nodes.flatMap(_.metrics.map(_.accumulatorId)) + // Assume all accumulators are long + var accumulatorValue = 0L + val accumulatorUpdates = accumulatorIds.map { id => + accumulatorValue += 1L + (id, accumulatorValue) + }.toMap + + listener.onExecutionStart( + executionId, + "test", + "test", + df.queryExecution.toString, + SparkPlanGraph(df.queryExecution.executedPlan), + System.currentTimeMillis()) + + val executionUIData = listener.executionIdToData(0) + + listener.onJobStart(SparkListenerJobStart( + jobId = 0, + time = System.currentTimeMillis(), + stageInfos = Seq( + createStageInfo(0, 0), + createStageInfo(1, 0) + ), + createProperties(executionId))) + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 0))) + + assert(listener.getExecutionMetrics(0).isEmpty) + + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( + // (task id, stage id, stage attempt, metrics) + (0L, 0, 0, createTaskMetrics(accumulatorUpdates)), + (1L, 0, 0, createTaskMetrics(accumulatorUpdates)) + ))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( + // (task id, stage id, stage attempt, metrics) + (0L, 0, 0, createTaskMetrics(accumulatorUpdates)), + (1L, 0, 0, createTaskMetrics(accumulatorUpdates.mapValues(_ * 2))) + ))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 3)) + + // Retrying a stage should reset the metrics + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 1))) + + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( + // (task 
id, stage id, stage attempt, metrics) + (0L, 0, 1, createTaskMetrics(accumulatorUpdates)), + (1L, 0, 1, createTaskMetrics(accumulatorUpdates)) + ))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + + // Ignore the task end for the first attempt + listener.onTaskEnd(SparkListenerTaskEnd( + stageId = 0, + stageAttemptId = 0, + taskType = "", + reason = null, + createTaskInfo(0, 0), + createTaskMetrics(accumulatorUpdates.mapValues(_ * 100)))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + + // Finish two tasks + listener.onTaskEnd(SparkListenerTaskEnd( + stageId = 0, + stageAttemptId = 1, + taskType = "", + reason = null, + createTaskInfo(0, 0), + createTaskMetrics(accumulatorUpdates.mapValues(_ * 2)))) + listener.onTaskEnd(SparkListenerTaskEnd( + stageId = 0, + stageAttemptId = 1, + taskType = "", + reason = null, + createTaskInfo(1, 0), + createTaskMetrics(accumulatorUpdates.mapValues(_ * 3)))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 5)) + + // Summit a new stage + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(1, 0))) + + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( + // (task id, stage id, stage attempt, metrics) + (0L, 1, 0, createTaskMetrics(accumulatorUpdates)), + (1L, 1, 0, createTaskMetrics(accumulatorUpdates)) + ))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 7)) + + // Finish two tasks + listener.onTaskEnd(SparkListenerTaskEnd( + stageId = 1, + stageAttemptId = 0, + taskType = "", + reason = null, + createTaskInfo(0, 0), + createTaskMetrics(accumulatorUpdates.mapValues(_ * 3)))) + listener.onTaskEnd(SparkListenerTaskEnd( + stageId = 1, + stageAttemptId = 0, + taskType = "", + reason = null, + createTaskInfo(1, 0), + createTaskMetrics(accumulatorUpdates.mapValues(_ * 3)))) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 11)) + + assert(executionUIData.runningJobs === Seq(0)) + assert(executionUIData.succeededJobs.isEmpty) + assert(executionUIData.failedJobs.isEmpty) + + listener.onJobEnd(SparkListenerJobEnd( + jobId = 0, + time = System.currentTimeMillis(), + JobSucceeded + )) + listener.onExecutionEnd(executionId, System.currentTimeMillis()) + + assert(executionUIData.runningJobs.isEmpty) + assert(executionUIData.succeededJobs === Seq(0)) + assert(executionUIData.failedJobs.isEmpty) + + assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 11)) + } + + test("onExecutionEnd happens before onJobEnd(JobSucceeded)") { + val listener = new SQLListener(ctx) + val executionId = 0 + val df = createTestDataFrame + listener.onExecutionStart( + executionId, + "test", + "test", + df.queryExecution.toString, + SparkPlanGraph(df.queryExecution.executedPlan), + System.currentTimeMillis()) + listener.onJobStart(SparkListenerJobStart( + jobId = 0, + time = System.currentTimeMillis(), + stageInfos = Nil, + createProperties(executionId))) + listener.onExecutionEnd(executionId, System.currentTimeMillis()) + listener.onJobEnd(SparkListenerJobEnd( + jobId = 0, + time = System.currentTimeMillis(), + JobSucceeded + )) + + val executionUIData = listener.executionIdToData(0) + assert(executionUIData.runningJobs.isEmpty) + assert(executionUIData.succeededJobs === Seq(0)) + assert(executionUIData.failedJobs.isEmpty) + } + + test("onExecutionEnd happens before multiple onJobEnd(JobSucceeded)s") { + val listener = new SQLListener(ctx) + val executionId = 
0 + val df = createTestDataFrame + listener.onExecutionStart( + executionId, + "test", + "test", + df.queryExecution.toString, + SparkPlanGraph(df.queryExecution.executedPlan), + System.currentTimeMillis()) + listener.onJobStart(SparkListenerJobStart( + jobId = 0, + time = System.currentTimeMillis(), + stageInfos = Nil, + createProperties(executionId))) + listener.onJobEnd(SparkListenerJobEnd( + jobId = 0, + time = System.currentTimeMillis(), + JobSucceeded + )) + + listener.onJobStart(SparkListenerJobStart( + jobId = 1, + time = System.currentTimeMillis(), + stageInfos = Nil, + createProperties(executionId))) + listener.onExecutionEnd(executionId, System.currentTimeMillis()) + listener.onJobEnd(SparkListenerJobEnd( + jobId = 1, + time = System.currentTimeMillis(), + JobSucceeded + )) + + val executionUIData = listener.executionIdToData(0) + assert(executionUIData.runningJobs.isEmpty) + assert(executionUIData.succeededJobs.sorted === Seq(0, 1)) + assert(executionUIData.failedJobs.isEmpty) + } + + test("onExecutionEnd happens before onJobEnd(JobFailed)") { + val listener = new SQLListener(ctx) + val executionId = 0 + val df = createTestDataFrame + listener.onExecutionStart( + executionId, + "test", + "test", + df.queryExecution.toString, + SparkPlanGraph(df.queryExecution.executedPlan), + System.currentTimeMillis()) + listener.onJobStart(SparkListenerJobStart( + jobId = 0, + time = System.currentTimeMillis(), + stageInfos = Seq.empty, + createProperties(executionId))) + listener.onExecutionEnd(executionId, System.currentTimeMillis()) + listener.onJobEnd(SparkListenerJobEnd( + jobId = 0, + time = System.currentTimeMillis(), + JobFailed(new RuntimeException("Oops")) + )) + + val executionUIData = listener.executionIdToData(0) + assert(executionUIData.runningJobs.isEmpty) + assert(executionUIData.succeededJobs.isEmpty) + assert(executionUIData.failedJobs === Seq(0)) + } + + ignore("no memory leak") { + val conf = new SparkConf() + .setMaster("local") + .setAppName("test") + .set("spark.task.maxFailures", "1") // Don't retry the tasks to run this test quickly + .set("spark.sql.ui.retainedExecutions", "50") // Set it to 50 to run this test quickly + val sc = new SparkContext(conf) + try { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // Run 100 successful executions and 100 failed executions. + // Each execution only has one job and one stage. 
+ for (i <- 0 until 100) { + val df = Seq( + (1, 1), + (2, 2) + ).toDF() + df.collect() + try { + df.foreach(_ => throw new RuntimeException("Oops")) + } catch { + case e: SparkException => // This is expected for a failed job + } + } + sc.listenerBus.waitUntilEmpty(10000) + assert(sqlContext.listener.getCompletedExecutions.size <= 50) + assert(sqlContext.listener.getFailedExecutions.size <= 50) + // 50 for successful executions and 50 for failed executions + assert(sqlContext.listener.executionIdToData.size <= 100) + assert(sqlContext.listener.jobIdToExecutionId.size <= 100) + assert(sqlContext.listener.stageIdToStageMetrics.size <= 100) + } finally { + sc.stop() + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 42f2449afb0f..0edac0848c3b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -25,10 +25,13 @@ import org.h2.jdbc.JdbcSQLException import org.scalatest.BeforeAndAfter import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JDBCSuite extends SparkFunSuite with BeforeAndAfter { +class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext { + import testImplicits._ + val url = "jdbc:h2:mem:testdb0" val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass" var conn: java.sql.Connection = null @@ -42,10 +45,6 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter { Some(StringType) } - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - import ctx.sql - before { Utils.classForName("org.h2.Driver") // Extra properties that will be specified for our database. 
We need these to test @@ -444,5 +443,4 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter { assert(agg.getCatalystType(0, "", 1, null) === Some(LongType)) assert(agg.getCatalystType(1, "", 1, null) === Some(StringType)) } - } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 84b52ca2c733..5dc3a2c07b8c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -23,11 +23,13 @@ import java.util.Properties import org.scalatest.BeforeAndAfter import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{SaveMode, Row} +import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter { +class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext { + val url = "jdbc:h2:mem:testdb2" var conn: java.sql.Connection = null val url1 = "jdbc:h2:mem:testdb3" @@ -37,10 +39,6 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter { properties.setProperty("password", "testPass") properties.setProperty("rowId", "false") - private lazy val ctx = org.apache.spark.sql.test.TestSQLContext - import ctx.implicits._ - import ctx.sql - before { Utils.classForName("org.h2.Driver") conn = DriverManager.getConnection(url) @@ -58,14 +56,14 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter { "create table test.people1 (name TEXT(32) NOT NULL, theid INTEGER NOT NULL)").executeUpdate() conn1.commit() - ctx.sql( + sql( s""" |CREATE TEMPORARY TABLE PEOPLE |USING org.apache.spark.sql.jdbc |OPTIONS (url '$url1', dbtable 'TEST.PEOPLE', user 'testUser', password 'testPass') """.stripMargin.replaceAll("\n", " ")) - ctx.sql( + sql( s""" |CREATE TEMPORARY TABLE PEOPLE1 |USING org.apache.spark.sql.jdbc @@ -144,14 +142,14 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter { } test("INSERT to JDBC Datasource") { - ctx.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE") + sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE") assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count) assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) } test("INSERT to JDBC Datasource with overwrite") { - ctx.sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE") - ctx.sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE") + sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE") + sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE") assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count) assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/optimizer/FilterNullsInJoinKeySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/optimizer/FilterNullsInJoinKeySuite.scala deleted file mode 100644 index f98e4acafbf2..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/optimizer/FilterNullsInJoinKeySuite.scala +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.optimizer - -import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries -import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Not, AtLeastNNulls} -import org.apache.spark.sql.catalyst.optimizer._ -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan} -import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.test.TestSQLContext - -/** This is the test suite for FilterNullsInJoinKey optimization rule. */ -class FilterNullsInJoinKeySuite extends PlanTest { - - // We add predicate pushdown rules at here to make sure we do not - // create redundant Filter operators. Also, because the attribute ordering of - // the Project operator added by ColumnPruning may be not deterministic - // (the ordering may depend on the testing environment), - // we first construct the plan with expected Filter operators and then - // run the optimizer to add the the Project for column pruning. - object Optimize extends RuleExecutor[LogicalPlan] { - val batches = - Batch("Subqueries", Once, - EliminateSubQueries) :: - Batch("Operator Optimizations", FixedPoint(100), - FilterNullsInJoinKey(TestSQLContext), // This is the rule we test in this suite. - CombineFilters, - PushPredicateThroughProject, - BooleanSimplification, - PushPredicateThroughJoin, - PushPredicateThroughGenerate, - ColumnPruning, - ProjectCollapsing) :: Nil - } - - val leftRelation = LocalRelation('a.int, 'b.int, 'c.int, 'd.int) - - val rightRelation = LocalRelation('e.int, 'f.int, 'g.int, 'h.int) - - test("inner join") { - val joinCondition = - ('a === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, Inner, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - val optimized = Optimize.execute(joinedPlan.analyze) - - // For an inner join, FilterNullsInJoinKey add filter to both side. - val correctLeft = - leftRelation - .where(!(AtLeastNNulls(1, 'a.expr :: Nil))) - - val correctRight = - rightRelation.where(!(AtLeastNNulls(1, 'e.expr :: 'f.expr :: Nil))) - - val correctAnswer = - correctLeft - .join(correctRight, Inner, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - comparePlans(optimized, Optimize.execute(correctAnswer.analyze)) - } - - test("make sure we do not keep adding filters") { - val thirdRelation = LocalRelation('i.int, 'j.int, 'k.int, 'l.int) - val joinedPlan = - leftRelation - .join(rightRelation, Inner, Some('a === 'e)) - .join(thirdRelation, Inner, Some('b === 'i && 'a === 'j)) - - val optimized = Optimize.execute(joinedPlan.analyze) - val conditions = optimized.collect { - case Filter(condition @ Not(AtLeastNNulls(1, exprs)), _) => exprs - } - - // Make sure that we have three Not(AtLeastNNulls(1, exprs)) for those three tables. 
- assert(conditions.length === 3) - - // Make sure attribtues are indeed a, b, e, i, and j. - assert( - conditions.flatMap(exprs => exprs).toSet === - joinedPlan.select('a, 'b, 'e, 'i, 'j).analyze.output.toSet) - } - - test("inner join (partially optimized)") { - val joinCondition = - ('a + 2 === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, Inner, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - val optimized = Optimize.execute(joinedPlan.analyze) - - // We cannot extract attribute from the left join key. - val correctRight = - rightRelation.where(!(AtLeastNNulls(1, 'e.expr :: 'f.expr :: Nil))) - - val correctAnswer = - leftRelation - .join(correctRight, Inner, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - comparePlans(optimized, Optimize.execute(correctAnswer.analyze)) - } - - test("inner join (not optimized)") { - val nonOptimizedJoinConditions = - Some('c - 100 + 'd === 'g + 1 - 'h) :: - Some('d > 'h || 'c === 'g) :: - Some('d + 'g + 'c > 'd - 'h) :: Nil - - nonOptimizedJoinConditions.foreach { joinCondition => - val joinedPlan = - leftRelation - .join(rightRelation.select('f, 'g, 'h), Inner, joinCondition) - .select('a, 'c, 'f, 'd, 'h, 'g) - - val optimized = Optimize.execute(joinedPlan.analyze) - - comparePlans(optimized, Optimize.execute(joinedPlan.analyze)) - } - } - - test("left outer join") { - val joinCondition = - ('a === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, LeftOuter, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - val optimized = Optimize.execute(joinedPlan.analyze) - - // For a left outer join, FilterNullsInJoinKey add filter to the right side. - val correctRight = - rightRelation.where(!(AtLeastNNulls(1, 'e.expr :: 'f.expr :: Nil))) - - val correctAnswer = - leftRelation - .join(correctRight, LeftOuter, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - comparePlans(optimized, Optimize.execute(correctAnswer.analyze)) - } - - test("right outer join") { - val joinCondition = - ('a === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, RightOuter, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - val optimized = Optimize.execute(joinedPlan.analyze) - - // For a right outer join, FilterNullsInJoinKey add filter to the left side. - val correctLeft = - leftRelation - .where(!(AtLeastNNulls(1, 'a.expr :: Nil))) - - val correctAnswer = - correctLeft - .join(rightRelation, RightOuter, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - - comparePlans(optimized, Optimize.execute(correctAnswer.analyze)) - } - - test("full outer join") { - val joinCondition = - ('a === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, FullOuter, Some(joinCondition)) - .select('a, 'f, 'd, 'h) - - // FilterNullsInJoinKey does not fire for a full outer join. - val optimized = Optimize.execute(joinedPlan.analyze) - - comparePlans(optimized, Optimize.execute(joinedPlan.analyze)) - } - - test("left semi join") { - val joinCondition = - ('a === 'e && 'b + 1 === 'f) && ('d > 'h || 'd === 'g) - - val joinedPlan = - leftRelation - .join(rightRelation, LeftSemi, Some(joinCondition)) - .select('a, 'd) - - val optimized = Optimize.execute(joinedPlan.analyze) - - // For a left semi join, FilterNullsInJoinKey add filter to both side. 
- val correctLeft = - leftRelation - .where(!(AtLeastNNulls(1, 'a.expr :: Nil))) - - val correctRight = - rightRelation.where(!(AtLeastNNulls(1, 'e.expr :: 'f.expr :: Nil))) - - val correctAnswer = - correctLeft - .join(correctRight, LeftSemi, Some(joinCondition)) - .select('a, 'd) - - comparePlans(optimized, Optimize.execute(correctAnswer.analyze)) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala deleted file mode 100644 index bfa427349ff6..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.nio.ByteBuffer -import java.util.{List => JList, Map => JMap} - -import scala.collection.JavaConversions._ - -import org.apache.hadoop.fs.Path -import org.apache.parquet.avro.AvroParquetWriter - -import org.apache.spark.sql.parquet.test.avro.{Nested, ParquetAvroCompat} -import org.apache.spark.sql.test.TestSQLContext -import org.apache.spark.sql.{Row, SQLContext} - -class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest { - import ParquetCompatibilityTest._ - - override val sqlContext: SQLContext = TestSQLContext - - override protected def beforeAll(): Unit = { - super.beforeAll() - - val writer = - new AvroParquetWriter[ParquetAvroCompat]( - new Path(parquetStore.getCanonicalPath), - ParquetAvroCompat.getClassSchema) - - (0 until 10).foreach(i => writer.write(makeParquetAvroCompat(i))) - writer.close() - } - - test("Read Parquet file generated by parquet-avro") { - logInfo( - s"""Schema of the Parquet file written by parquet-avro: - |${readParquetSchema(parquetStore.getCanonicalPath)} - """.stripMargin) - - checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) - - Row( - i % 2 == 0, - i, - i.toLong * 10, - i.toFloat + 0.1f, - i.toDouble + 0.2d, - s"val_$i".getBytes, - s"val_$i", - - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i: Integer), - nullable(i.toLong: java.lang.Long), - nullable(i.toFloat + 0.1f: java.lang.Float), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(s"val_$i".getBytes), - nullable(s"val_$i"), - - Seq.tabulate(3)(n => s"arr_${i + n}"), - Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap, - Seq.tabulate(3) { n => - (i + n).toString -> Seq.tabulate(3) { m => - Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}") - } - }.toMap) - }) - } - - def makeParquetAvroCompat(i: Int): ParquetAvroCompat = { - def nullable[T <: AnyRef] = makeNullable[T](i) _ - - def 
makeComplexColumn(i: Int): JMap[String, JList[Nested]] = { - mapAsJavaMap(Seq.tabulate(3) { n => - (i + n).toString -> seqAsJavaList(Seq.tabulate(3) { m => - Nested - .newBuilder() - .setNestedIntsColumn(seqAsJavaList(Seq.tabulate(3)(j => i + j + m))) - .setNestedStringColumn(s"val_${i + m}") - .build() - }) - }.toMap) - } - - ParquetAvroCompat - .newBuilder() - .setBoolColumn(i % 2 == 0) - .setIntColumn(i) - .setLongColumn(i.toLong * 10) - .setFloatColumn(i.toFloat + 0.1f) - .setDoubleColumn(i.toDouble + 0.2d) - .setBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes)) - .setStringColumn(s"val_$i") - - .setMaybeBoolColumn(nullable(i % 2 == 0: java.lang.Boolean)) - .setMaybeIntColumn(nullable(i: Integer)) - .setMaybeLongColumn(nullable(i.toLong: java.lang.Long)) - .setMaybeFloatColumn(nullable(i.toFloat + 0.1f: java.lang.Float)) - .setMaybeDoubleColumn(nullable(i.toDouble + 0.2d: java.lang.Double)) - .setMaybeBinaryColumn(nullable(ByteBuffer.wrap(s"val_$i".getBytes))) - .setMaybeStringColumn(nullable(s"val_$i")) - - .setStringsColumn(Seq.tabulate(3)(n => s"arr_${i + n}")) - .setStringToIntColumn( - mapAsJavaMap(Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap)) - .setComplexColumn(makeComplexColumn(i)) - - .build() - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala deleted file mode 100644 index a95f70f2bba6..000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.io.File - -import org.apache.hadoop.fs.Path - -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{QueryTest, Row, SQLConf} -import org.apache.spark.util.Utils - -/** - * A test suite that tests various Parquet queries. 
- */ -class ParquetQuerySuite extends QueryTest with ParquetTest { - lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext - import sqlContext.sql - - test("simple select queries") { - withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { - checkAnswer(sql("SELECT _1 FROM t where t._1 > 5"), (6 until 10).map(Row.apply(_))) - checkAnswer(sql("SELECT _1 FROM t as tmp where tmp._1 < 5"), (0 until 5).map(Row.apply(_))) - } - } - - test("appending") { - val data = (0 until 10).map(i => (i, i.toString)) - sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") - withParquetTable(data, "t") { - sql("INSERT INTO TABLE t SELECT * FROM tmp") - checkAnswer(sqlContext.table("t"), (data ++ data).map(Row.fromTuple)) - } - sqlContext.catalog.unregisterTable(Seq("tmp")) - } - - test("overwriting") { - val data = (0 until 10).map(i => (i, i.toString)) - sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") - withParquetTable(data, "t") { - sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") - checkAnswer(sqlContext.table("t"), data.map(Row.fromTuple)) - } - sqlContext.catalog.unregisterTable(Seq("tmp")) - } - - test("self-join") { - // 4 rows, cells of column 1 of row 2 and row 4 are null - val data = (1 to 4).map { i => - val maybeInt = if (i % 2 == 0) None else Some(i) - (maybeInt, i.toString) - } - - withParquetTable(data, "t") { - val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") - val queryOutput = selfJoin.queryExecution.analyzed.output - - assertResult(4, "Field count mismatches")(queryOutput.size) - assertResult(2, "Duplicated expression ID in query plan:\n $selfJoin") { - queryOutput.filter(_.name == "_1").map(_.exprId).size - } - - checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) - } - } - - test("nested data - struct with array field") { - val data = (1 to 10).map(i => Tuple1((i, Seq("val_$i")))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { - case Tuple1((_, Seq(string))) => Row(string) - }) - } - } - - test("nested data - array of struct") { - val data = (1 to 10).map(i => Tuple1(Seq(i -> "val_$i"))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { - case Tuple1(Seq((_, string))) => Row(string) - }) - } - } - - test("SPARK-1913 regression: columns only referenced by pushed down filters should remain") { - withParquetTable((1 to 10).map(Tuple1.apply), "t") { - checkAnswer(sql("SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) - } - } - - test("SPARK-5309 strings stored using dictionary compression in parquet") { - withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { - - checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), - (0 until 10).map(i => Row("same", "run_" + i, 100))) - - checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), - List(Row("same", "run_5", 100))) - } - } - - test("SPARK-6917 DecimalType should work with non-native types") { - val data = (1 to 10).map(i => Row(Decimal(i, 18, 0), new java.sql.Timestamp(i))) - val schema = StructType(List(StructField("d", DecimalType(18, 0), false), - StructField("time", TimestampType, false)).toArray) - withTempPath { file => - val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(data), schema) - df.write.parquet(file.getCanonicalPath) - val df2 = sqlContext.read.parquet(file.getCanonicalPath) - checkAnswer(df2, df.collect().toSeq) - } - } - - test("Enabling/disabling 
merging partfiles when merging parquet schema") { - def testSchemaMerging(expectedColumnNumber: Int): Unit = { - withTempDir { dir => - val basePath = dir.getCanonicalPath - sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) - // delete summary files, so if we don't merge part-files, one column will not be included. - Utils.deleteRecursively(new File(basePath + "/foo=1/_metadata")) - Utils.deleteRecursively(new File(basePath + "/foo=1/_common_metadata")) - assert(sqlContext.read.parquet(basePath).columns.length === expectedColumnNumber) - } - } - - withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", - SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "true") { - testSchemaMerging(2) - } - - withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", - SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "false") { - testSchemaMerging(3) - } - } - - test("Enabling/disabling schema merging") { - def testSchemaMerging(expectedColumnNumber: Int): Unit = { - withTempDir { dir => - val basePath = dir.getCanonicalPath - sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) - assert(sqlContext.read.parquet(basePath).columns.length === expectedColumnNumber) - } - } - - withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true") { - testSchemaMerging(3) - } - - withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") { - testSchemaMerging(2) - } - } - - test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") { - withTempPath { dir => - val basePath = dir.getCanonicalPath - sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString) - - // Disables the global SQL option for schema merging - withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") { - assertResult(2) { - // Disables schema merging via data source option - sqlContext.read.option("mergeSchema", "false").parquet(basePath).columns.length - } - - assertResult(3) { - // Enables schema merging via data source option - sqlContext.read.option("mergeSchema", "true").parquet(basePath).columns.length - } - } - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 1907e643c85d..9bc3f6bcf6fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -19,28 +19,32 @@ package org.apache.spark.sql.sources import java.io.{File, IOException} -import org.scalatest.BeforeAndAfterAll +import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.DDLException +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils -class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { - - import caseInsensitiveContext.sql +class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { + protected override lazy val sql = caseInsensitiveContext.sql _ private lazy val sparkContext = caseInsensitiveContext.sparkContext - - var path: 
File = null + private var path: File = null override def beforeAll(): Unit = { + super.beforeAll() path = Utils.createTempDir() val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) caseInsensitiveContext.read.json(rdd).registerTempTable("jt") } override def afterAll(): Unit = { - caseInsensitiveContext.dropTempTable("jt") + try { + caseInsensitiveContext.dropTempTable("jt") + } finally { + super.afterAll() + } } after { @@ -51,7 +55,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -75,7 +79,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -92,7 +96,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -107,7 +111,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE IF NOT EXISTS jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -122,7 +126,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -139,7 +143,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -158,7 +162,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE IF NOT EXISTS jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -175,7 +179,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable (a int, b string) - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -188,7 +192,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS @@ -199,7 +203,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql( s""" |CREATE TEMPORARY TABLE jsonTable - |USING org.apache.spark.sql.json.DefaultSource + |USING json |OPTIONS ( | path '${path.toString}' |) AS diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala new file mode 100644 index 000000000000..853707c036c9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala @@ -0,0 +1,89 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.sources + +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{StringType, StructField, StructType} + + +// please note that the META-INF/services had to be modified for the test directory for this to work +class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { + + test("data sources with the same name") { + intercept[RuntimeException] { + caseInsensitiveContext.read.format("Fluet da Bomb").load() + } + } + + test("load data source from format alias") { + caseInsensitiveContext.read.format("gathering quorum").load().schema == + StructType(Seq(StructField("stringType", StringType, nullable = false))) + } + + test("specify full classname with duplicate formats") { + caseInsensitiveContext.read.format("org.apache.spark.sql.sources.FakeSourceOne") + .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) + } + + test("should fail to load ORC without HiveContext") { + intercept[ClassNotFoundException] { + caseInsensitiveContext.read.format("orc").load() + } + } +} + + +class FakeSourceOne extends RelationProvider with DataSourceRegister { + + def shortName(): String = "Fluet da Bomb" + + override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = + new BaseRelation { + override def sqlContext: SQLContext = cont + + override def schema: StructType = + StructType(Seq(StructField("stringType", StringType, nullable = false))) + } +} + +class FakeSourceTwo extends RelationProvider with DataSourceRegister { + + def shortName(): String = "Fluet da Bomb" + + override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = + new BaseRelation { + override def sqlContext: SQLContext = cont + + override def schema: StructType = + StructType(Seq(StructField("stringType", StringType, nullable = false))) + } +} + +class FakeSourceThree extends RelationProvider with DataSourceRegister { + + def shortName(): String = "gathering quorum" + + override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = + new BaseRelation { + override def sqlContext: SQLContext = cont + + override def schema: StructType = + StructType(Seq(StructField("stringType", StringType, nullable = false))) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala index 84855ce45e91..5f8514e1a241 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import 
org.apache.spark.unsafe.types.UTF8String @@ -68,10 +69,12 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo } } -class DDLTestSuite extends DataSourceTest { +class DDLTestSuite extends DataSourceTest with SharedSQLContext { + protected override lazy val sql = caseInsensitiveContext.sql _ - before { - caseInsensitiveContext.sql( + override def beforeAll(): Unit = { + super.beforeAll() + sql( """ |CREATE TEMPORARY TABLE ddlPeople |USING org.apache.spark.sql.sources.DDLScanSource @@ -105,7 +108,7 @@ class DDLTestSuite extends DataSourceTest { )) test("SPARK-7686 DescribeCommand should have correct physical plan output attributes") { - val attributes = caseInsensitiveContext.sql("describe ddlPeople") + val attributes = sql("describe ddlPeople") .queryExecution.executedPlan.output assert(attributes.map(_.name) === Seq("col_name", "data_type", "comment")) assert(attributes.map(_.dataType).toSet === Set(StringType)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 00cc7d5ea580..d74d29fb0beb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -17,18 +17,23 @@ package org.apache.spark.sql.sources -import org.scalatest.BeforeAndAfter - import org.apache.spark.sql._ -import org.apache.spark.sql.test.TestSQLContext -abstract class DataSourceTest extends QueryTest with BeforeAndAfter { +private[sql] abstract class DataSourceTest extends QueryTest { + protected def _sqlContext: SQLContext + // We want to test some edge cases. - protected implicit lazy val caseInsensitiveContext = { - val ctx = new SQLContext(TestSQLContext.sparkContext) + protected lazy val caseInsensitiveContext: SQLContext = { + val ctx = new SQLContext(_sqlContext.sparkContext) ctx.setConf(SQLConf.CASE_SENSITIVE, false) ctx } + protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row]) { + test(sqlString) { + checkAnswer(caseInsensitiveContext.sql(sqlString), expectedAnswer) + } + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 81b3a0f0c5b3..68ce37c00077 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -20,7 +20,9 @@ package org.apache.spark.sql.sources import scala.language.existentials import org.apache.spark.rdd.RDD +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.sql._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -56,6 +58,7 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL // Predicate test on integer column def translateFilterOnA(filter: Filter): Int => Boolean = filter match { case EqualTo("a", v) => (a: Int) => a == v + case EqualNullSafe("a", v) => (a: Int) => a == v case LessThan("a", v: Int) => (a: Int) => a < v case LessThanOrEqual("a", v: Int) => (a: Int) => a <= v case GreaterThan("a", v: Int) => (a: Int) => a > v @@ -76,6 +79,9 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL case StringStartsWith("c", v) => _.startsWith(v) case StringEndsWith("c", v) => _.endsWith(v) case StringContains("c", v) => _.contains(v) + case EqualTo("c", v: String) => 
_.equals(v) + case EqualTo("c", v: UTF8String) => sys.error("UTF8String should not appear in filters") + case In("c", values) => (s: String) => values.map(_.asInstanceOf[String]).toSet.contains(s) case _ => (c: String) => true } @@ -95,11 +101,11 @@ object FiltersPushed { var list: Seq[Filter] = Nil } -class FilteredScanSuite extends DataSourceTest { +class FilteredScanSuite extends DataSourceTest with SharedSQLContext { + protected override lazy val sql = caseInsensitiveContext.sql _ - import caseInsensitiveContext.sql - - before { + override def beforeAll(): Unit = { + super.beforeAll() sql( """ |CREATE TEMPORARY TABLE oneToTenFiltered @@ -235,6 +241,9 @@ class FilteredScanSuite extends DataSourceTest { testPushDown("SELECT a, b, c FROM oneToTenFiltered WHERE c like '%eE%'", 1) testPushDown("SELECT a, b, c FROM oneToTenFiltered WHERE c like '%Ee%'", 0) + testPushDown("SELECT c FROM oneToTenFiltered WHERE c = 'aaaaaAAAAA'", 1) + testPushDown("SELECT c FROM oneToTenFiltered WHERE c IN ('aaaaaAAAAA', 'foo')", 1) + def testPushDown(sqlString: String, expectedCount: Int): Unit = { test(s"PushDown Returns $expectedCount: $sqlString") { val queryExecution = sql(sqlString).queryExecution diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 0b7c46c482c8..084d83f6e9bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -19,22 +19,19 @@ package org.apache.spark.sql.sources import java.io.File -import org.scalatest.BeforeAndAfterAll - import org.apache.spark.sql.{SaveMode, AnalysisException, Row} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils -class InsertSuite extends DataSourceTest with BeforeAndAfterAll { - - import caseInsensitiveContext.sql - +class InsertSuite extends DataSourceTest with SharedSQLContext { + protected override lazy val sql = caseInsensitiveContext.sql _ private lazy val sparkContext = caseInsensitiveContext.sparkContext + private var path: File = null - var path: File = null - - override def beforeAll: Unit = { + override def beforeAll(): Unit = { + super.beforeAll() path = Utils.createTempDir() - val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) caseInsensitiveContext.read.json(rdd).registerTempTable("jt") sql( s""" @@ -46,10 +43,14 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { """.stripMargin) } - override def afterAll: Unit = { - caseInsensitiveContext.dropTempTable("jsonTable") - caseInsensitiveContext.dropTempTable("jt") - Utils.deleteRecursively(path) + override def afterAll(): Unit = { + try { + caseInsensitiveContext.dropTempTable("jsonTable") + caseInsensitiveContext.dropTempTable("jt") + Utils.deleteRecursively(path) + } finally { + super.afterAll() + } } test("Simple INSERT OVERWRITE a JSONRelation") { @@ -110,7 +111,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { ) // Writing the table to less part files. 
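// For reference, the SQL pattern this suite drives can be reproduced as follows; a minimal
// sketch, assuming an active sqlContext and a scratch directory `path` (the table names are
// illustrative). With this change set, INSERT OVERWRITE replaces the json-backed data and
// INSERT INTO appends to it:
val src = sqlContext.sparkContext.parallelize(
  (1 to 3).map(i => s"""{"a":$i, "b":"str$i"}"""))
sqlContext.read.json(src).registerTempTable("src")
sqlContext.sql(
  s"""
     |CREATE TEMPORARY TABLE jsonSink (a int, b string)
     |USING json
     |OPTIONS (path '${path.getCanonicalPath}')
   """.stripMargin)
sqlContext.sql("INSERT OVERWRITE TABLE jsonSink SELECT a, b FROM src")
sqlContext.sql("INSERT INTO TABLE jsonSink SELECT a, b FROM src")   // appends a second copy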
- val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 5) + val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}"""), 5) caseInsensitiveContext.read.json(rdd1).registerTempTable("jt1") sql( s""" @@ -122,7 +123,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { ) // Writing the table to more part files. - val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""), 10) + val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}"""), 10) caseInsensitiveContext.read.json(rdd2).registerTempTable("jt2") sql( s""" @@ -146,27 +147,23 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { caseInsensitiveContext.dropTempTable("jt2") } - test("INSERT INTO not supported for JSONRelation for now") { - intercept[RuntimeException]{ - sql( - s""" - |INSERT INTO TABLE jsonTable SELECT a, b FROM jt - """.stripMargin) - } - } - - test("save directly to the path of a JSON table") { - caseInsensitiveContext.table("jt").selectExpr("a * 5 as a", "b") - .write.mode(SaveMode.Overwrite).json(path.toString) + test("INSERT INTO JSONRelation for now") { + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt + """.stripMargin) checkAnswer( sql("SELECT a, b FROM jsonTable"), - (1 to 10).map(i => Row(i * 5, s"str$i")) + sql("SELECT a, b FROM jt").collect() ) - caseInsensitiveContext.table("jt").write.mode(SaveMode.Overwrite).json(path.toString) + sql( + s""" + |INSERT INTO TABLE jsonTable SELECT a, b FROM jt + """.stripMargin) checkAnswer( sql("SELECT a, b FROM jsonTable"), - (1 to 10).map(i => Row(i, s"str$i")) + sql("SELECT a, b FROM jt UNION ALL SELECT a, b FROM jt").collect() ) } @@ -183,6 +180,11 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { } test("Caching") { + // write something to the jsonTable + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jt + """.stripMargin) // Cached Query Execution caseInsensitiveContext.cacheTable("jsonTable") assertCached(sql("SELECT * FROM jsonTable")) @@ -205,9 +207,10 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { sql("SELECT a * 2 FROM jsonTable"), (1 to 10).map(i => Row(i * 2)).toSeq) - assertCached(sql("SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), 2) - checkAnswer( - sql("SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), + assertCached(sql( + "SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), 2) + checkAnswer(sql( + "SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), (2 to 10).map(i => Row(i, i - 1)).toSeq) // Insert overwrite and keep the same schema. @@ -217,14 +220,15 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { """.stripMargin) // jsonTable should be recached. assertCached(sql("SELECT * FROM jsonTable")) - // The cached data is the new data. - checkAnswer( - sql("SELECT a, b FROM jsonTable"), - sql("SELECT a * 2, b FROM jt").collect()) - - // Verify uncaching - caseInsensitiveContext.uncacheTable("jsonTable") - assertCached(sql("SELECT * FROM jsonTable"), 0) + // TODO we need to invalidate the cached data in InsertIntoHadoopFsRelation +// // The cached data is the new data. 
+// checkAnswer( +// sql("SELECT a, b FROM jsonTable"), +// sql("SELECT a * 2, b FROM jt").collect()) +// +// // Verify uncaching +// caseInsensitiveContext.uncacheTable("jsonTable") +// assertCached(sql("SELECT * FROM jsonTable"), 0) } test("it's not allowed to insert into a relation that is not an InsertableRelation") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala new file mode 100644 index 000000000000..79b6e9b45c00 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import org.apache.spark.sql.{Row, QueryTest} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.util.Utils + +class PartitionedWriteSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + test("write many partitions") { + val path = Utils.createTempDir() + path.delete() + + val df = ctx.range(100).select($"id", lit(1).as("data")) + df.write.partitionBy("id").save(path.getCanonicalPath) + + checkAnswer( + ctx.read.load(path.getCanonicalPath), + (0 to 99).map(Row(1, _)).toSeq) + + Utils.deleteRecursively(path) + } + + test("write many partitions with repeats") { + val path = Utils.createTempDir() + path.delete() + + val base = ctx.range(100) + val df = base.unionAll(base).select($"id", lit(1).as("data")) + df.write.partitionBy("id").save(path.getCanonicalPath) + + checkAnswer( + ctx.read.load(path.getCanonicalPath), + (0 to 99).map(Row(1, _)).toSeq ++ (0 to 99).map(Row(1, _)).toSeq) + + Utils.deleteRecursively(path) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala index 0d5183444af7..a89c5f8007e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala @@ -21,6 +21,7 @@ import scala.language.existentials import org.apache.spark.rdd.RDD import org.apache.spark.sql._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ class PrunedScanSource extends RelationProvider { @@ -51,10 +52,12 @@ case class SimplePrunedScan(from: Int, to: Int)(@transient val sqlContext: SQLCo } } -class PrunedScanSuite extends DataSourceTest { +class PrunedScanSuite extends DataSourceTest with SharedSQLContext { + protected override lazy val sql = caseInsensitiveContext.sql _ - before { - caseInsensitiveContext.sql( + override def beforeAll(): Unit = { + super.beforeAll() 
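// A note on the partitioned-write round trip exercised by PartitionedWriteSuite above:
// the partition column is encoded in the directory layout (e.g. .../id=42/part-...), and
// when the data is read back the partition columns are appended after the data columns,
// which is why the expected rows there are Row(1, id) rather than Row(id, 1). A minimal
// sketch, assuming a SQLContext `ctx` with implicits imported and a scratch directory `dir`:
import org.apache.spark.sql.functions.lit
val out = ctx.range(10).select($"id", lit(1).as("data"))
out.write.partitionBy("id").parquet(dir.getCanonicalPath)
ctx.read.parquet(dir.getCanonicalPath).printSchema()   // prints data first, then id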
+ sql( """ |CREATE TEMPORARY TABLE oneToTenPruned |USING org.apache.spark.sql.sources.PrunedScanSource @@ -114,7 +117,7 @@ class PrunedScanSuite extends DataSourceTest { def testPruning(sqlString: String, expectedColumns: String*): Unit = { test(s"Columns output ${expectedColumns.mkString(",")}: $sqlString") { - val queryExecution = caseInsensitiveContext.sql(sqlString).queryExecution + val queryExecution = sql(sqlString).queryExecution val rawPlan = queryExecution.executedPlan.collect { case p: execution.PhysicalRDD => p } match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala index 3cbf5467b253..27d1cd92fca1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala @@ -22,14 +22,39 @@ import org.apache.spark.sql.execution.datasources.ResolvedDataSource class ResolvedDataSourceSuite extends SparkFunSuite { - test("builtin sources") { - assert(ResolvedDataSource.lookupDataSource("jdbc") === - classOf[org.apache.spark.sql.jdbc.DefaultSource]) + test("jdbc") { + assert( + ResolvedDataSource.lookupDataSource("jdbc") === + classOf[org.apache.spark.sql.execution.datasources.jdbc.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.execution.datasources.jdbc") === + classOf[org.apache.spark.sql.execution.datasources.jdbc.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.jdbc") === + classOf[org.apache.spark.sql.execution.datasources.jdbc.DefaultSource]) + } - assert(ResolvedDataSource.lookupDataSource("json") === - classOf[org.apache.spark.sql.json.DefaultSource]) + test("json") { + assert( + ResolvedDataSource.lookupDataSource("json") === + classOf[org.apache.spark.sql.execution.datasources.json.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.execution.datasources.json") === + classOf[org.apache.spark.sql.execution.datasources.json.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.json") === + classOf[org.apache.spark.sql.execution.datasources.json.DefaultSource]) + } - assert(ResolvedDataSource.lookupDataSource("parquet") === - classOf[org.apache.spark.sql.parquet.DefaultSource]) + test("parquet") { + assert( + ResolvedDataSource.lookupDataSource("parquet") === + classOf[org.apache.spark.sql.execution.datasources.parquet.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.execution.datasources.parquet") === + classOf[org.apache.spark.sql.execution.datasources.parquet.DefaultSource]) + assert( + ResolvedDataSource.lookupDataSource("org.apache.spark.sql.parquet") === + classOf[org.apache.spark.sql.execution.datasources.parquet.DefaultSource]) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala index b032515a9d28..f18546b4c2d9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala @@ -19,25 +19,22 @@ package org.apache.spark.sql.sources import java.io.File -import org.scalatest.BeforeAndAfterAll +import org.scalatest.BeforeAndAfter -import org.apache.spark.sql.{SaveMode, SQLConf, DataFrame} +import org.apache.spark.sql.{AnalysisException, SaveMode, 
SQLConf, DataFrame} +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { - - import caseInsensitiveContext.sql - +class SaveLoadSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { + protected override lazy val sql = caseInsensitiveContext.sql _ private lazy val sparkContext = caseInsensitiveContext.sparkContext - - var originalDefaultSource: String = null - - var path: File = null - - var df: DataFrame = null + private var originalDefaultSource: String = null + private var path: File = null + private var df: DataFrame = null override def beforeAll(): Unit = { + super.beforeAll() originalDefaultSource = caseInsensitiveContext.conf.defaultDataSourceName path = Utils.createTempDir() @@ -49,27 +46,32 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { } override def afterAll(): Unit = { - caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) + try { + caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) + } finally { + super.afterAll() + } } after { - caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) Utils.deleteRecursively(path) } - def checkLoad(): Unit = { + def checkLoad(expectedDF: DataFrame = df, tbl: String = "jsonTable"): Unit = { caseInsensitiveContext.conf.setConf( SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") - checkAnswer(caseInsensitiveContext.read.load(path.toString), df.collect()) + checkAnswer(caseInsensitiveContext.read.load(path.toString), expectedDF.collect()) // Test if we can pick up the data source name passed in load. caseInsensitiveContext.conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") - checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), df.collect()) - checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), df.collect()) + checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), + expectedDF.collect()) + checkAnswer(caseInsensitiveContext.read.format("json").load(path.toString), + expectedDF.collect()) val schema = StructType(StructField("b", StringType, true) :: Nil) checkAnswer( caseInsensitiveContext.read.format("json").schema(schema).load(path.toString), - sql("SELECT b FROM jsonTable").collect()) + sql(s"SELECT b FROM $tbl").collect()) } test("save with path and load") { @@ -102,7 +104,7 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { test("save and save again") { df.write.json(path.toString) - var message = intercept[RuntimeException] { + val message = intercept[AnalysisException] { df.write.json(path.toString) }.getMessage @@ -118,12 +120,11 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { df.write.mode(SaveMode.Overwrite).json(path.toString) checkLoad() - message = intercept[RuntimeException] { - df.write.mode(SaveMode.Append).json(path.toString) - }.getMessage + // verify the append mode + df.write.mode(SaveMode.Append).json(path.toString) + val df2 = df.unionAll(df) + df2.registerTempTable("jsonTable2") - assert( - message.contains("Append mode is not supported"), - "We should complain that 'Append mode is not supported' for JSON source.") + checkLoad(df2, "jsonTable2") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index cfb03ff485b7..12af8068c398 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -17,14 +17,13 @@ package org.apache.spark.sql.sources +import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String class DefaultSource extends SimpleScanSource @@ -73,7 +72,7 @@ case class AllDataTypesScan( sqlContext.sparkContext.parallelize(from to to).map { i => Row( s"str_$i", - s"str_$i".getBytes(), + s"str_$i".getBytes(StandardCharsets.UTF_8), i % 2 == 0, i.toByte, i.toShort, @@ -83,7 +82,7 @@ case class AllDataTypesScan( i.toDouble, new java.math.BigDecimal(i), new java.math.BigDecimal(i), - new Date(1970, 1, 1), + Date.valueOf("1970-01-01"), new Timestamp(20000 + i), s"varchar_$i", Seq(i, i + 1), @@ -92,13 +91,13 @@ case class AllDataTypesScan( Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)), Row(i, i.toString), Row(Seq(s"str_$i", s"str_${i + 1}"), - Row(Seq(new Date(1970, 1, i + 1))))) + Row(Seq(Date.valueOf(s"1970-01-${i + 1}"))))) } } } -class TableScanSuite extends DataSourceTest { - import caseInsensitiveContext.sql +class TableScanSuite extends DataSourceTest with SharedSQLContext { + protected override lazy val sql = caseInsensitiveContext.sql _ private lazy val tableWithSchemaExpected = (1 to 10).map { i => Row( @@ -113,7 +112,7 @@ class TableScanSuite extends DataSourceTest { i.toDouble, new java.math.BigDecimal(i), new java.math.BigDecimal(i), - new Date(1970, 1, 1), + Date.valueOf("1970-01-01"), new Timestamp(20000 + i), s"varchar_$i", Seq(i, i + 1), @@ -121,10 +120,11 @@ class TableScanSuite extends DataSourceTest { Map(i -> i.toString), Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)), Row(i, i.toString), - Row(Seq(s"str_$i", s"str_${i + 1}"), Row(Seq(new Date(1970, 1, i + 1))))) + Row(Seq(s"str_$i", s"str_${i + 1}"), Row(Seq(Date.valueOf(s"1970-01-${i + 1}"))))) }.toSeq - before { + override def beforeAll(): Unit = { + super.beforeAll() sql( """ |CREATE TEMPORARY TABLE oneToTen @@ -280,7 +280,7 @@ class TableScanSuite extends DataSourceTest { sqlTest( "SELECT structFieldComplex.Value.`value_(2)` FROM tableWithSchema", - (1 to 10).map(i => Row(Seq(new Date(1970, 1, i + 1)))).toSeq) + (1 to 10).map(i => Row(Seq(Date.valueOf(s"1970-01-${i + 1}")))).toSeq) test("Caching") { // Cached Query Execution @@ -305,9 +305,10 @@ class TableScanSuite extends DataSourceTest { sql("SELECT i * 2 FROM oneToTen"), (1 to 10).map(i => Row(i * 2)).toSeq) - assertCached(sql("SELECT a.i, b.i FROM oneToTen a JOIN oneToTen b ON a.i = b.i + 1"), 2) - checkAnswer( - sql("SELECT a.i, b.i FROM oneToTen a JOIN oneToTen b ON a.i = b.i + 1"), + assertCached(sql( + "SELECT a.i, b.i FROM oneToTen a JOIN oneToTen b ON a.i = b.i + 1"), 2) + checkAnswer(sql( + "SELECT a.i, b.i FROM oneToTen a JOIN oneToTen b ON a.i = b.i + 1"), (2 to 10).map(i => Row(i, i - 1)).toSeq) // Verify uncaching diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/ProcessTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/ProcessTestUtils.scala new file mode 100644 index 000000000000..152c9c8459de --- /dev/null 
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/ProcessTestUtils.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import java.io.{IOException, InputStream} + +import scala.sys.process.BasicIO + +object ProcessTestUtils { + class ProcessOutputCapturer(stream: InputStream, capture: String => Unit) extends Thread { + this.setDaemon(true) + + override def run(): Unit = { + try { + BasicIO.processFully(capture)(stream) + } catch { case _: IOException => + // Ignores the IOException thrown when the process termination, which closes the input + // stream abruptly. + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala new file mode 100644 index 000000000000..1374a97476ca --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, SQLContext, SQLImplicits} + +/** + * A collection of sample data used in SQL tests. + */ +private[sql] trait SQLTestData { self => + protected def _sqlContext: SQLContext + + // Helper object to import SQL implicits without a concrete SQLContext + private object internalImplicits extends SQLImplicits { + protected override def _sqlContext: SQLContext = self._sqlContext + } + + import internalImplicits._ + import SQLTestData._ + + // Note: all test data should be lazy because the SQLContext is not set up yet. 
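// For context, a minimal sketch of how a suite is expected to consume this data; the suite
// name and query below are illustrative, and the sketch assumes SharedSQLContext supplies
// the underlying _sqlContext. Calling setupTestData() (from SQLTestUtils) asks for these
// lazy tables to be materialized and registered before the first test runs:
import org.apache.spark.sql.{QueryTest, Row}

class ExampleQuerySuite extends QueryTest with SharedSQLContext {
  setupTestData()   // registers testData, testData2, ... as temp tables in beforeAll()
  test("testData2 is registered as a temp table") {
    checkAnswer(sql("SELECT count(*) FROM testData2"), Row(6L))
  }
}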
+ + protected lazy val testData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + (1 to 100).map(i => TestData(i, i.toString))).toDF() + df.registerTempTable("testData") + df + } + + protected lazy val testData2: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + TestData2(1, 1) :: + TestData2(1, 2) :: + TestData2(2, 1) :: + TestData2(2, 2) :: + TestData2(3, 1) :: + TestData2(3, 2) :: Nil, 2).toDF() + df.registerTempTable("testData2") + df + } + + protected lazy val testData3: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + TestData3(1, None) :: + TestData3(2, Some(2)) :: Nil).toDF() + df.registerTempTable("testData3") + df + } + + protected lazy val negativeData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() + df.registerTempTable("negativeData") + df + } + + protected lazy val largeAndSmallInts: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + LargeAndSmallInts(2147483644, 1) :: + LargeAndSmallInts(1, 2) :: + LargeAndSmallInts(2147483645, 1) :: + LargeAndSmallInts(2, 2) :: + LargeAndSmallInts(2147483646, 1) :: + LargeAndSmallInts(3, 2) :: Nil).toDF() + df.registerTempTable("largeAndSmallInts") + df + } + + protected lazy val decimalData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + DecimalData(1, 1) :: + DecimalData(1, 2) :: + DecimalData(2, 1) :: + DecimalData(2, 2) :: + DecimalData(3, 1) :: + DecimalData(3, 2) :: Nil).toDF() + df.registerTempTable("decimalData") + df + } + + protected lazy val binaryData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + BinaryData("12".getBytes, 1) :: + BinaryData("22".getBytes, 5) :: + BinaryData("122".getBytes, 3) :: + BinaryData("121".getBytes, 2) :: + BinaryData("123".getBytes, 4) :: Nil).toDF() + df.registerTempTable("binaryData") + df + } + + protected lazy val upperCaseData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + UpperCaseData(1, "A") :: + UpperCaseData(2, "B") :: + UpperCaseData(3, "C") :: + UpperCaseData(4, "D") :: + UpperCaseData(5, "E") :: + UpperCaseData(6, "F") :: Nil).toDF() + df.registerTempTable("upperCaseData") + df + } + + protected lazy val lowerCaseData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + LowerCaseData(1, "a") :: + LowerCaseData(2, "b") :: + LowerCaseData(3, "c") :: + LowerCaseData(4, "d") :: Nil).toDF() + df.registerTempTable("lowerCaseData") + df + } + + protected lazy val arrayData: RDD[ArrayData] = { + val rdd = _sqlContext.sparkContext.parallelize( + ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: + ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) + rdd.toDF().registerTempTable("arrayData") + rdd + } + + protected lazy val mapData: RDD[MapData] = { + val rdd = _sqlContext.sparkContext.parallelize( + MapData(Map(1 -> "a1", 2 -> "b1", 3 -> "c1", 4 -> "d1", 5 -> "e1")) :: + MapData(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: + MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: + MapData(Map(1 -> "a4", 2 -> "b4")) :: + MapData(Map(1 -> "a5")) :: Nil) + rdd.toDF().registerTempTable("mapData") + rdd + } + + protected lazy val repeatedData: RDD[StringData] = { + val rdd = _sqlContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) + rdd.toDF().registerTempTable("repeatedData") + rdd + } + + protected lazy val nullableRepeatedData: RDD[StringData] = { + val rdd = _sqlContext.sparkContext.parallelize( + List.fill(2)(StringData(null)) ++ + List.fill(2)(StringData("test"))) + 
rdd.toDF().registerTempTable("nullableRepeatedData") + rdd + } + + protected lazy val nullInts: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + NullInts(1) :: + NullInts(2) :: + NullInts(3) :: + NullInts(null) :: Nil).toDF() + df.registerTempTable("nullInts") + df + } + + protected lazy val allNulls: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + NullInts(null) :: + NullInts(null) :: + NullInts(null) :: + NullInts(null) :: Nil).toDF() + df.registerTempTable("allNulls") + df + } + + protected lazy val nullStrings: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + NullStrings(1, "abc") :: + NullStrings(2, "ABC") :: + NullStrings(3, null) :: Nil).toDF() + df.registerTempTable("nullStrings") + df + } + + protected lazy val tableName: DataFrame = { + val df = _sqlContext.sparkContext.parallelize(TableName("test") :: Nil).toDF() + df.registerTempTable("tableName") + df + } + + protected lazy val unparsedStrings: RDD[String] = { + _sqlContext.sparkContext.parallelize( + "1, A1, true, null" :: + "2, B2, false, null" :: + "3, C3, true, null" :: + "4, D4, true, 2147483644" :: Nil) + } + + // An RDD with 4 elements and 8 partitions + protected lazy val withEmptyParts: RDD[IntField] = { + val rdd = _sqlContext.sparkContext.parallelize((1 to 4).map(IntField), 8) + rdd.toDF().registerTempTable("withEmptyParts") + rdd + } + + protected lazy val person: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + Person(0, "mike", 30) :: + Person(1, "jim", 20) :: Nil).toDF() + df.registerTempTable("person") + df + } + + protected lazy val salary: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + Salary(0, 2000.0) :: + Salary(1, 1000.0) :: Nil).toDF() + df.registerTempTable("salary") + df + } + + protected lazy val complexData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + ComplexData(Map("1" -> 1), TestData(1, "1"), Seq(1, 1, 1), true) :: + ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2, 2, 2), false) :: + Nil).toDF() + df.registerTempTable("complexData") + df + } + + /** + * Initialize all test data such that all temp tables are properly registered. + */ + def loadTestData(): Unit = { + assert(_sqlContext != null, "attempted to initialize test data before SQLContext.") + testData + testData2 + testData3 + negativeData + largeAndSmallInts + decimalData + binaryData + upperCaseData + lowerCaseData + arrayData + mapData + repeatedData + nullableRepeatedData + nullInts + allNulls + nullStrings + tableName + unparsedStrings + withEmptyParts + person + salary + complexData + } +} + +/** + * Case classes used in test data. 
+ */ +private[sql] object SQLTestData { + case class TestData(key: Int, value: String) + case class TestData2(a: Int, b: Int) + case class TestData3(a: Int, b: Option[Int]) + case class LargeAndSmallInts(a: Int, b: Int) + case class DecimalData(a: BigDecimal, b: BigDecimal) + case class BinaryData(a: Array[Byte], b: Int) + case class UpperCaseData(N: Int, L: String) + case class LowerCaseData(n: Int, l: String) + case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) + case class MapData(data: scala.collection.Map[Int, String]) + case class StringData(s: String) + case class IntField(i: Int) + case class NullInts(a: Integer) + case class NullStrings(n: Int, s: String) + case class TableName(tableName: String) + case class Person(id: Int, name: String, age: Int) + case class Salary(personId: Int, salary: Double) + case class ComplexData(m: Map[String, Int], s: TestData, a: Seq[Int], b: Boolean) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 4c11acdab9ec..cdd691e03589 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -21,15 +21,71 @@ import java.io.File import java.util.UUID import scala.util.Try +import scala.language.implicitConversions + +import org.apache.hadoop.conf.Configuration +import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.{DataFrame, SQLContext, SQLImplicits} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.util.Utils -trait SQLTestUtils { this: SparkFunSuite => - def sqlContext: SQLContext +/** + * Helper trait that should be extended by all SQL test suites. + * + * This allows subclasses to plugin a custom [[SQLContext]]. It comes with test data + * prepared in advance as well as all implicit conversions used extensively by dataframes. + * To use implicit methods, import `testImplicits._` instead of through the [[SQLContext]]. + * + * Subclasses should *not* create [[SQLContext]]s in the test suite constructor, which is + * prone to leaving multiple overlapping [[org.apache.spark.SparkContext]]s in the same JVM. + */ +private[sql] trait SQLTestUtils + extends SparkFunSuite + with BeforeAndAfterAll + with SQLTestData { self => + + protected def _sqlContext: SQLContext + + // Whether to materialize all test data before the first test is run + private var loadTestDataBeforeTests = false + + // Shorthand for running a query using our SQLContext + protected lazy val sql = _sqlContext.sql _ + + /** + * A helper object for importing SQL implicits. + * + * Note that the alternative of importing `sqlContext.implicits._` is not possible here. + * This is because we create the [[SQLContext]] immediately before the first test is run, + * but the implicits import is needed in the constructor. + */ + protected object testImplicits extends SQLImplicits { + protected override def _sqlContext: SQLContext = self._sqlContext + } + + /** + * Materialize the test data immediately after the [[SQLContext]] is set up. + * This is necessary if the data is accessed by name but not through direct reference. 
+ */ + protected def setupTestData(): Unit = { + loadTestDataBeforeTests = true + } - protected def configuration = sqlContext.sparkContext.hadoopConfiguration + protected override def beforeAll(): Unit = { + super.beforeAll() + if (loadTestDataBeforeTests) { + loadTestData() + } + } + + /** + * The Hadoop configuration used by the active [[SQLContext]]. + */ + protected def configuration: Configuration = { + _sqlContext.sparkContext.hadoopConfiguration + } /** * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL @@ -39,12 +95,12 @@ trait SQLTestUtils { this: SparkFunSuite => */ protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(sqlContext.conf.getConfString(key)).toOption) - (keys, values).zipped.foreach(sqlContext.conf.setConfString) + val currentValues = keys.map(key => Try(_sqlContext.conf.getConfString(key)).toOption) + (keys, values).zipped.foreach(_sqlContext.conf.setConfString) try f finally { keys.zip(currentValues).foreach { - case (key, Some(value)) => sqlContext.conf.setConfString(key, value) - case (key, None) => sqlContext.conf.unsetConf(key) + case (key, Some(value)) => _sqlContext.conf.setConfString(key, value) + case (key, None) => _sqlContext.conf.unsetConf(key) } } } @@ -76,7 +132,7 @@ trait SQLTestUtils { this: SparkFunSuite => * Drops temporary table `tableName` after calling `f`. */ protected def withTempTable(tableNames: String*)(f: => Unit): Unit = { - try f finally tableNames.foreach(sqlContext.dropTempTable) + try f finally tableNames.foreach(_sqlContext.dropTempTable) } /** @@ -85,7 +141,7 @@ trait SQLTestUtils { this: SparkFunSuite => protected def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => - sqlContext.sql(s"DROP TABLE IF EXISTS $name") + _sqlContext.sql(s"DROP TABLE IF EXISTS $name") } } } @@ -98,12 +154,12 @@ trait SQLTestUtils { this: SparkFunSuite => val dbName = s"db_${UUID.randomUUID().toString.replace('-', '_')}" try { - sqlContext.sql(s"CREATE DATABASE $dbName") + _sqlContext.sql(s"CREATE DATABASE $dbName") } catch { case cause: Throwable => fail("Failed to create temporary database", cause) } - try f(dbName) finally sqlContext.sql(s"DROP DATABASE $dbName CASCADE") + try f(dbName) finally _sqlContext.sql(s"DROP DATABASE $dbName CASCADE") } /** @@ -111,7 +167,15 @@ trait SQLTestUtils { this: SparkFunSuite => * `f` returns. */ protected def activateDatabase(db: String)(f: => Unit): Unit = { - sqlContext.sql(s"USE $db") - try f finally sqlContext.sql(s"USE default") + _sqlContext.sql(s"USE $db") + try f finally _sqlContext.sql(s"USE default") + } + + /** + * Turn a logical plan into a [[DataFrame]]. This should be removed once we have an easier + * way to construct [[DataFrame]] directly out of local data without relying on implicits. + */ + protected implicit def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { + DataFrame(_sqlContext, plan) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala new file mode 100644 index 000000000000..d23c6a073266 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.sql.{ColumnName, SQLContext} + + +/** + * Helper trait for SQL test suites where all tests share a single [[TestSQLContext]]. + */ +trait SharedSQLContext extends SQLTestUtils { + + /** + * The [[TestSQLContext]] to use for all tests in this suite. + * + * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local + * mode with the default test configurations. + */ + private var _ctx: TestSQLContext = null + + /** + * The [[TestSQLContext]] to use for all tests in this suite. + */ + protected def ctx: TestSQLContext = _ctx + protected def sqlContext: TestSQLContext = _ctx + protected override def _sqlContext: SQLContext = _ctx + + /** + * Initialize the [[TestSQLContext]]. + */ + protected override def beforeAll(): Unit = { + if (_ctx == null) { + _ctx = new TestSQLContext + } + // Ensure we have initialized the context before calling parent code + super.beforeAll() + } + + /** + * Stop the underlying [[org.apache.spark.SparkContext]], if any. + */ + protected override def afterAll(): Unit = { + try { + if (_ctx != null) { + _ctx.sparkContext.stop() + _ctx = null + } + } finally { + super.afterAll() + } + } + + /** + * Converts $"col name" into an [[Column]]. + * @since 1.3.0 + */ + // This must be duplicated here to preserve binary compatibility with Spark < 1.5. + implicit class StringToColumn(val sc: StringContext) { + def $(args: Any*): ColumnName = { + new ColumnName(sc.s(args: _*)) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala similarity index 54% rename from sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala rename to sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index b3a4231da91c..92ef2f7d74ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -17,40 +17,36 @@ package org.apache.spark.sql.test -import scala.language.implicitConversions - import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan - -/** A SQLContext that can be used for local testing. */ -class LocalSQLContext - extends SQLContext( - new SparkContext("local[2]", "TestSQLContext", new SparkConf() - .set("spark.sql.testkey", "true") - // SPARK-8910 - .set("spark.ui.enabled", "false"))) { - - override protected[sql] def createSession(): SQLSession = { - new this.SQLSession() +import org.apache.spark.sql.{SQLConf, SQLContext} + + +/** + * A special [[SQLContext]] prepared for testing. 
+ */ +private[sql] class TestSQLContext(sc: SparkContext) extends SQLContext(sc) { self => + + def this() { + this(new SparkContext("local[2]", "test-sql-context", + new SparkConf().set("spark.sql.testkey", "true"))) } + // Use fewer partitions to speed up testing + protected[sql] override def createSession(): SQLSession = new this.SQLSession() + + /** A special [[SQLSession]] that uses fewer shuffle partitions than normal. */ protected[sql] class SQLSession extends super.SQLSession { protected[sql] override lazy val conf: SQLConf = new SQLConf { - /** Fewer partitions to speed up testing. */ override def numShufflePartitions: Int = this.getConf(SQLConf.SHUFFLE_PARTITIONS, 5) } } - /** - * Turn a logical plan into a [[DataFrame]]. This should be removed once we have an easier way to - * construct [[DataFrame]] directly out of local data without relying on implicits. - */ - protected[sql] implicit def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { - DataFrame(this, plan) + // Needed for Java tests + def loadTestData(): Unit = { + testData.loadTestData() } + private object testData extends SQLTestData { + protected override def _sqlContext: SQLContext = self + } } - -object TestSQLContext extends LocalSQLContext - diff --git a/sql/core/src/test/scripts/gen-code.sh b/sql/core/src/test/scripts/gen-avro.sh similarity index 76% rename from sql/core/src/test/scripts/gen-code.sh rename to sql/core/src/test/scripts/gen-avro.sh index 5d8d8ad08555..48174b287fd7 100755 --- a/sql/core/src/test/scripts/gen-code.sh +++ b/sql/core/src/test/scripts/gen-avro.sh @@ -22,10 +22,9 @@ cd - rm -rf $BASEDIR/gen-java mkdir -p $BASEDIR/gen-java -thrift\ - --gen java\ - -out $BASEDIR/gen-java\ - $BASEDIR/thrift/parquet-compat.thrift - -avro-tools idl $BASEDIR/avro/parquet-compat.avdl > $BASEDIR/avro/parquet-compat.avpr -avro-tools compile -string protocol $BASEDIR/avro/parquet-compat.avpr $BASEDIR/gen-java +for input in `ls $BASEDIR/avro/*.avdl`; do + filename=$(basename "$input") + filename="${filename%.*}" + avro-tools idl $input> $BASEDIR/avro/${filename}.avpr + avro-tools compile -string protocol $BASEDIR/avro/${filename}.avpr $BASEDIR/gen-java +done diff --git a/sql/core/src/test/scripts/gen-thrift.sh b/sql/core/src/test/scripts/gen-thrift.sh new file mode 100755 index 000000000000..ada432c68ab9 --- /dev/null +++ b/sql/core/src/test/scripts/gen-thrift.sh @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cd $(dirname $0)/.. 
+BASEDIR=`pwd` +cd - + +rm -rf $BASEDIR/gen-java +mkdir -p $BASEDIR/gen-java + +for input in `ls $BASEDIR/thrift/*.thrift`; do + thrift --gen java -out $BASEDIR/gen-java $input +done diff --git a/sql/core/src/test/thrift/parquet-compat.thrift b/sql/core/src/test/thrift/parquet-compat.thrift index fa5ed8c62306..98bf778aec5d 100644 --- a/sql/core/src/test/thrift/parquet-compat.thrift +++ b/sql/core/src/test/thrift/parquet-compat.thrift @@ -15,7 +15,7 @@ * limitations under the License. */ -namespace java org.apache.spark.sql.parquet.test.thrift +namespace java org.apache.spark.sql.execution.datasources.parquet.test.thrift enum Suit { SPADES, diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 73e6ccdb1eaf..e9bace983d25 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -60,21 +60,38 @@ ${hive.group} hive-jdbc + + ${hive.group} + hive-service + ${hive.group} hive-beeline + + com.sun.jersey + jersey-core + + + com.sun.jersey + jersey-json + + + com.sun.jersey + jersey-server + org.seleniumhq.selenium selenium-java test - - - io.netty - netty - - + + + org.apache.spark + spark-sql_${scala.binary.version} + test-jar + ${project.version} + test diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/hive/service/server/HiveServerServerOptionsProcessor.scala b/sql/hive-thriftserver/src/main/scala/org/apache/hive/service/server/HiveServerServerOptionsProcessor.scala new file mode 100644 index 000000000000..2228f651e238 --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/hive/service/server/HiveServerServerOptionsProcessor.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hive.service.server + +import org.apache.hive.service.server.HiveServer2.{StartOptionExecutor, ServerOptionsProcessor} + +/** + * Class to upgrade a package-private class to public, and + * implement a `process()` operation consistent with + * the behavior of older Hive versions + * @param serverName name of the hive server + */ +private[apache] class HiveServerServerOptionsProcessor(serverName: String) + extends ServerOptionsProcessor(serverName) { + + def process(args: Array[String]): Boolean = { + // A parse failure automatically triggers a system exit + val response = super.parse(args) + val executor = response.getServerOptionsExecutor() + // return true if the parsed option was to start the service + executor.isInstanceOf[StartOptionExecutor] + } +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index b7db80d93f85..dd9fef9206d0 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.hive.thriftserver +import java.util.Locale +import java.util.concurrent.atomic.AtomicBoolean + import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -24,7 +27,7 @@ import org.apache.commons.logging.LogFactory import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} -import org.apache.hive.service.server.{HiveServer2, ServerOptionsProcessor} +import org.apache.hive.service.server.{HiveServerServerOptionsProcessor, HiveServer2} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerJobStart} @@ -32,7 +35,7 @@ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} import org.apache.spark.{Logging, SparkContext} @@ -65,7 +68,7 @@ object HiveThriftServer2 extends Logging { } def main(args: Array[String]) { - val optionsProcessor = new ServerOptionsProcessor("HiveThriftServer2") + val optionsProcessor = new HiveServerServerOptionsProcessor("HiveThriftServer2") if (!optionsProcessor.process(args)) { System.exit(-1) } @@ -73,7 +76,7 @@ object HiveThriftServer2 extends Logging { logInfo("Starting SparkContext") SparkSQLEnv.init() - Utils.addShutdownHook { () => + ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() uiTab.foreach(_.detach()) } @@ -149,16 +152,26 @@ object HiveThriftServer2 extends Logging { override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { server.stop() } - var onlineSessionNum: Int = 0 - val sessionList = new mutable.LinkedHashMap[String, SessionInfo] - val executionList = new mutable.LinkedHashMap[String, ExecutionInfo] - val retainedStatements = - conf.getConf(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT) - val retainedSessions = - conf.getConf(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT) - var totalRunning = 0 - - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + 
private var onlineSessionNum: Int = 0 + private val sessionList = new mutable.LinkedHashMap[String, SessionInfo] + private val executionList = new mutable.LinkedHashMap[String, ExecutionInfo] + private val retainedStatements = conf.getConf(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT) + private val retainedSessions = conf.getConf(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT) + private var totalRunning = 0 + + def getOnlineSessionNum: Int = synchronized { onlineSessionNum } + + def getTotalRunning: Int = synchronized { totalRunning } + + def getSessionList: Seq[SessionInfo] = synchronized { sessionList.values.toSeq } + + def getSession(sessionId: String): Option[SessionInfo] = synchronized { + sessionList.get(sessionId) + } + + def getExecutionList: Seq[ExecutionInfo] = synchronized { executionList.values.toSeq } + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = synchronized { for { props <- Option(jobStart.properties) groupId <- Option(props.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) @@ -170,13 +183,15 @@ object HiveThriftServer2 extends Logging { } def onSessionCreated(ip: String, sessionId: String, userName: String = "UNKNOWN"): Unit = { - val info = new SessionInfo(sessionId, System.currentTimeMillis, ip, userName) - sessionList.put(sessionId, info) - onlineSessionNum += 1 - trimSessionIfNecessary() + synchronized { + val info = new SessionInfo(sessionId, System.currentTimeMillis, ip, userName) + sessionList.put(sessionId, info) + onlineSessionNum += 1 + trimSessionIfNecessary() + } } - def onSessionClosed(sessionId: String): Unit = { + def onSessionClosed(sessionId: String): Unit = synchronized { sessionList(sessionId).finishTimestamp = System.currentTimeMillis onlineSessionNum -= 1 trimSessionIfNecessary() @@ -187,7 +202,7 @@ object HiveThriftServer2 extends Logging { sessionId: String, statement: String, groupId: String, - userName: String = "UNKNOWN"): Unit = { + userName: String = "UNKNOWN"): Unit = synchronized { val info = new ExecutionInfo(statement, sessionId, System.currentTimeMillis, userName) info.state = ExecutionState.STARTED executionList.put(id, info) @@ -197,27 +212,29 @@ object HiveThriftServer2 extends Logging { totalRunning += 1 } - def onStatementParsed(id: String, executionPlan: String): Unit = { + def onStatementParsed(id: String, executionPlan: String): Unit = synchronized { executionList(id).executePlan = executionPlan executionList(id).state = ExecutionState.COMPILED } def onStatementError(id: String, errorMessage: String, errorTrace: String): Unit = { - executionList(id).finishTimestamp = System.currentTimeMillis - executionList(id).detail = errorMessage - executionList(id).state = ExecutionState.FAILED - totalRunning -= 1 - trimExecutionIfNecessary() + synchronized { + executionList(id).finishTimestamp = System.currentTimeMillis + executionList(id).detail = errorMessage + executionList(id).state = ExecutionState.FAILED + totalRunning -= 1 + trimExecutionIfNecessary() + } } - def onStatementFinish(id: String): Unit = { + def onStatementFinish(id: String): Unit = synchronized { executionList(id).finishTimestamp = System.currentTimeMillis executionList(id).state = ExecutionState.FINISHED totalRunning -= 1 trimExecutionIfNecessary() } - private def trimExecutionIfNecessary() = synchronized { + private def trimExecutionIfNecessary() = { if (executionList.size > retainedStatements) { val toRemove = math.max(retainedStatements / 10, 1) executionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s => @@ -226,7 +243,7 @@ object HiveThriftServer2 
extends Logging { } } - private def trimSessionIfNecessary() = synchronized { + private def trimSessionIfNecessary() = { if (sessionList.size > retainedSessions) { val toRemove = math.max(retainedSessions / 10, 1) sessionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s => @@ -241,9 +258,12 @@ object HiveThriftServer2 extends Logging { private[hive] class HiveThriftServer2(hiveContext: HiveContext) extends HiveServer2 with ReflectedCompositeService { + // state is tracked internally so that the server only attempts to shut down if it successfully + // started, and then once only. + private val started = new AtomicBoolean(false) override def init(hiveConf: HiveConf) { - val sparkSqlCliService = new SparkSQLCLIService(hiveContext) + val sparkSqlCliService = new SparkSQLCLIService(this, hiveContext) setSuperField(this, "cliService", sparkSqlCliService) addService(sparkSqlCliService) @@ -259,8 +279,19 @@ private[hive] class HiveThriftServer2(hiveContext: HiveContext) } private def isHTTPTransportMode(hiveConf: HiveConf): Boolean = { - val transportMode: String = hiveConf.getVar(ConfVars.HIVE_SERVER2_TRANSPORT_MODE) - transportMode.equalsIgnoreCase("http") + val transportMode = hiveConf.getVar(ConfVars.HIVE_SERVER2_TRANSPORT_MODE) + transportMode.toLowerCase(Locale.ENGLISH).equals("http") } + + override def start(): Unit = { + super.start() + started.set(true) + } + + override def stop(): Unit = { + if (started.getAndSet(false)) { + super.stop() + } + } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index e8758887ff3a..02cc7e5efa52 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -32,8 +32,7 @@ import org.apache.hive.service.cli._ import org.apache.hadoop.hive.ql.metadata.Hive import org.apache.hadoop.hive.ql.metadata.HiveException import org.apache.hadoop.hive.ql.session.SessionState -import org.apache.hadoop.hive.shims.ShimLoader -import org.apache.hadoop.security.UserGroupInformation +import org.apache.hadoop.hive.shims.Utils import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession @@ -146,7 +145,7 @@ private[hive] class SparkExecuteStatementOperation( } else { val parentSessionState = SessionState.get() val hiveConf = getConfigForOperation() - val sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf) + val sparkServiceUGI = Utils.getUGI() val sessionHive = getCurrentHive() val currentSqlSession = hiveContext.currentSession @@ -160,6 +159,12 @@ private[hive] class SparkExecuteStatementOperation( // User information is part of the metastore client member in Hive hiveContext.setSession(currentSqlSession) + // Always use the latest class loader provided by executionHive's state. 
+ val executionHiveClassLoader = + hiveContext.executionHive.state.getConf.getClassLoader + sessionHive.getConf.setClassLoader(executionHiveClassLoader) + parentSessionState.getConf.setClassLoader(executionHiveClassLoader) + Hive.set(sessionHive) SessionState.setCurrentSessionState(parentSessionState) try { @@ -174,7 +179,7 @@ private[hive] class SparkExecuteStatementOperation( } try { - ShimLoader.getHadoopShims().doAs(sparkServiceUGI, doAsAction) + sparkServiceUGI.doAs(doAsAction) } catch { case e: Exception => setOperationException(new HiveSQLException(e)) @@ -201,7 +206,7 @@ private[hive] class SparkExecuteStatementOperation( } } - private def runInternal(): Unit = { + override def runInternal(): Unit = { statementId = UUID.randomUUID().toString logInfo(s"Running query '$statement' with $statementId") setState(OperationState.RUNNING) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index f66a17b20915..e58f8ca25476 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql.hive.thriftserver import scala.collection.JavaConversions._ import java.io._ -import java.util.{ArrayList => JArrayList} +import java.util.{ArrayList => JArrayList, Locale} -import jline.{ConsoleReader, History} +import jline.console.ConsoleReader +import jline.console.history.FileHistory import org.apache.commons.lang3.StringUtils import org.apache.commons.logging.LogFactory @@ -38,8 +39,12 @@ import org.apache.thrift.transport.TSocket import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} +/** + * This code doesn't support remote connections in Hive 1.2+, as the underlying CliDriver + * has dropped its support. + */ private[hive] object SparkSQLCLIDriver extends Logging { private var prompt = "spark-sql" private var continuedPrompt = "".padTo(prompt.length, ' ') @@ -109,18 +114,11 @@ private[hive] object SparkSQLCLIDriver extends Logging { SessionState.start(sessionState) // Clean up after we exit - Utils.addShutdownHook { () => SparkSQLEnv.stop() } + ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() } + val remoteMode = isRemoteMode(sessionState) // "-h" option has been passed, so connect to Hive thrift server. - if (sessionState.getHost != null) { - sessionState.connect() - if (sessionState.isRemoteMode) { - prompt = s"[${sessionState.getHost}:${sessionState.getPort}]" + prompt - continuedPrompt = "".padTo(prompt.length, ' ') - } - } - - if (!sessionState.isRemoteMode) { + if (!remoteMode) { // Hadoop-20 and above - we need to augment classpath using hiveconf // components. 
// See also: code in ExecDriver.java @@ -131,6 +129,9 @@ private[hive] object SparkSQLCLIDriver extends Logging { } conf.setClassLoader(loader) Thread.currentThread().setContextClassLoader(loader) + } else { + // Hive 1.2 + not supported in CLI + throw new RuntimeException("Remote operations not supported") } val cli = new SparkSQLCLIDriver @@ -170,15 +171,16 @@ private[hive] object SparkSQLCLIDriver extends Logging { val reader = new ConsoleReader() reader.setBellEnabled(false) + reader.setExpandEvents(false) // reader.setDebug(new PrintWriter(new FileWriter("writer.debug", true))) - CliDriver.getCommandCompletor.foreach((e) => reader.addCompletor(e)) + CliDriver.getCommandCompleter.foreach((e) => reader.addCompleter(e)) val historyDirectory = System.getProperty("user.home") try { if (new File(historyDirectory).exists()) { val historyFile = historyDirectory + File.separator + ".hivehistory" - reader.setHistory(new History(new File(historyFile))) + reader.setHistory(new FileHistory(new File(historyFile))) } else { logWarning("WARNING: Directory for Hive history file: " + historyDirectory + " does not exist. History will not be available during this session.") @@ -190,10 +192,14 @@ private[hive] object SparkSQLCLIDriver extends Logging { logWarning(e.getMessage) } + // TODO: missing +/* val clientTransportTSocketField = classOf[CliSessionState].getDeclaredField("transport") clientTransportTSocketField.setAccessible(true) transport = clientTransportTSocketField.get(sessionState).asInstanceOf[TSocket] +*/ + transport = null var ret = 0 var prefix = "" @@ -230,6 +236,13 @@ private[hive] object SparkSQLCLIDriver extends Logging { System.exit(ret) } + + + def isRemoteMode(state: CliSessionState): Boolean = { + // sessionState.isRemoteMode + state.isHiveServerQuery + } + } private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { @@ -239,25 +252,33 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { private val console = new SessionState.LogHelper(LOG) + private val isRemoteMode = { + SparkSQLCLIDriver.isRemoteMode(sessionState) + } + private val conf: Configuration = if (sessionState != null) sessionState.getConf else new Configuration() // Force initializing SparkSQLEnv. This is put here but not object SparkSQLCliDriver // because the Hive unit tests do not go through the main() code path. 
- if (!sessionState.isRemoteMode) { + if (!isRemoteMode) { SparkSQLEnv.init() + } else { + // Hive 1.2 + not supported in CLI + throw new RuntimeException("Remote operations not supported") } override def processCmd(cmd: String): Int = { val cmd_trimmed: String = cmd.trim() + val cmd_lower = cmd_trimmed.toLowerCase(Locale.ENGLISH) val tokens: Array[String] = cmd_trimmed.split("\\s+") val cmd_1: String = cmd_trimmed.substring(tokens(0).length()).trim() - if (cmd_trimmed.toLowerCase.equals("quit") || - cmd_trimmed.toLowerCase.equals("exit") || - tokens(0).equalsIgnoreCase("source") || + if (cmd_lower.equals("quit") || + cmd_lower.equals("exit") || + tokens(0).toLowerCase(Locale.ENGLISH).equals("source") || cmd_trimmed.startsWith("!") || tokens(0).toLowerCase.equals("list") || - sessionState.isRemoteMode) { + isRemoteMode) { val start = System.currentTimeMillis() super.processCmd(cmd) val end = System.currentTimeMillis() diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index 41f647d5f8c5..644165acf70a 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -23,11 +23,12 @@ import javax.security.auth.login.LoginException import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hadoop.hive.shims.ShimLoader +import org.apache.hadoop.hive.shims.Utils import org.apache.hadoop.security.UserGroupInformation import org.apache.hive.service.Service.STATE import org.apache.hive.service.auth.HiveAuthFactory import org.apache.hive.service.cli._ +import org.apache.hive.service.server.HiveServer2 import org.apache.hive.service.{AbstractService, Service, ServiceException} import org.apache.spark.sql.hive.HiveContext @@ -35,22 +36,22 @@ import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import scala.collection.JavaConversions._ -private[hive] class SparkSQLCLIService(hiveContext: HiveContext) - extends CLIService +private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext) + extends CLIService(hiveServer) with ReflectedCompositeService { override def init(hiveConf: HiveConf) { setSuperField(this, "hiveConf", hiveConf) - val sparkSqlSessionManager = new SparkSQLSessionManager(hiveContext) + val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, hiveContext) setSuperField(this, "sessionManager", sparkSqlSessionManager) addService(sparkSqlSessionManager) var sparkServiceUGI: UserGroupInformation = null - if (ShimLoader.getHadoopShims.isSecurityEnabled) { + if (UserGroupInformation.isSecurityEnabled) { try { HiveAuthFactory.loginFromKeytab(hiveConf) - sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf) + sparkServiceUGI = Utils.getUGI() setSuperField(this, "serviceUGI", sparkServiceUGI) } catch { case e @ (_: IOException | _: LoginException) => diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 2d5ee6800228..92ac0ec3fca2 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -25,14 +25,15 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.SessionHandle import org.apache.hive.service.cli.session.SessionManager import org.apache.hive.service.cli.thrift.TProtocolVersion +import org.apache.hive.service.server.HiveServer2 import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager -private[hive] class SparkSQLSessionManager(hiveContext: HiveContext) - extends SessionManager +private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveContext) + extends SessionManager(hiveServer) with ReflectedCompositeService { private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext) @@ -55,12 +56,14 @@ private[hive] class SparkSQLSessionManager(hiveContext: HiveContext) protocol: TProtocolVersion, username: String, passwd: String, + ipAddress: String, sessionConf: java.util.Map[String, String], withImpersonation: Boolean, delegationToken: String): SessionHandle = { hiveContext.openSession() - val sessionHandle = super.openSession( - protocol, username, passwd, sessionConf, withImpersonation, delegationToken) + val sessionHandle = + super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, + delegationToken) val session = super.getSession(sessionHandle) HiveThriftServer2.listener.onSessionCreated( session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala index 10c83d8b27a2..e990bd06011f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala @@ -39,14 +39,16 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" /** Render the page */ def render(request: HttpServletRequest): Seq[Node] = { val content = - generateBasicStats() ++ -
      <br/> ++
-      <h4>
-      {listener.onlineSessionNum} session(s) are online,
-      running {listener.totalRunning} SQL statement(s)
-      </h4> ++
-      generateSessionStatsTable() ++
-      generateSQLStatsTable()
+      listener.synchronized { // make sure all parts in this page are consistent
+        generateBasicStats() ++
+        <br/> ++
+        <h4>
+        {listener.getOnlineSessionNum} session(s) are online,
+        running {listener.getTotalRunning} SQL statement(s)
+        </h4>
++ + generateSessionStatsTable() ++ + generateSQLStatsTable() + } UIUtils.headerSparkPage("JDBC/ODBC Server", content, parent, Some(5000)) } @@ -65,11 +67,11 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" /** Generate stats of batch statements of the thrift server program */ private def generateSQLStatsTable(): Seq[Node] = { - val numStatement = listener.executionList.size + val numStatement = listener.getExecutionList.size val table = if (numStatement > 0) { val headerRow = Seq("User", "JobID", "GroupID", "Start Time", "Finish Time", "Duration", "Statement", "State", "Detail") - val dataRows = listener.executionList.values + val dataRows = listener.getExecutionList def generateDataRow(info: ExecutionInfo): Seq[Node] = { val jobLink = info.jobId.map { id: String => @@ -136,15 +138,15 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" /** Generate stats of batch sessions of the thrift server program */ private def generateSessionStatsTable(): Seq[Node] = { - val numBatches = listener.sessionList.size + val sessionList = listener.getSessionList + val numBatches = sessionList.size val table = if (numBatches > 0) { - val dataRows = - listener.sessionList.values + val dataRows = sessionList val headerRow = Seq("User", "IP", "Session ID", "Start Time", "Finish Time", "Duration", "Total Execute") def generateDataRow(session: SessionInfo): Seq[Node] = { - val sessionLink = "%s/sql/session?id=%s" - .format(UIUtils.prependBaseUri(parent.basePath), session.sessionId) + val sessionLink = "%s/%s/session?id=%s" + .format(UIUtils.prependBaseUri(parent.basePath), parent.prefix, session.sessionId)
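Note on the UI hunks above and below: they depend on the listener changes made earlier in HiveThriftServer2.scala. The mutable collections become private, every reader goes through a synchronized getter that returns an immutable snapshot, and page rendering is wrapped in listener.synchronized so the counts and tables on one page reflect a single consistent view. A minimal sketch of that pattern, with hypothetical names rather than the actual Spark classes:

import scala.collection.mutable

// Sketch only: writers mutate private state under the object's lock, readers
// get immutable snapshots taken under the same lock, and a render pass reads
// everything inside one synchronized block so the values it shows agree.
class StatsListener {
  private val sessionStartTimes = new mutable.LinkedHashMap[String, Long]
  private var totalRunning = 0

  def onSessionCreated(id: String): Unit = synchronized {
    sessionStartTimes.put(id, System.currentTimeMillis)
  }

  def onStatementStart(): Unit = synchronized { totalRunning += 1 }
  def onStatementFinish(): Unit = synchronized { totalRunning -= 1 }

  // Synchronized getters return copies, never the live mutable collections.
  def getSessionStartTimes: Seq[(String, Long)] = synchronized { sessionStartTimes.toSeq }
  def getTotalRunning: Int = synchronized { totalRunning }
}

object RenderSketch {
  def render(listener: StatsListener): String = listener.synchronized {
    // Both reads happen under the same lock, so the rendered line is consistent.
    s"${listener.getSessionStartTimes.size} session(s) are online, " +
      s"running ${listener.getTotalRunning} SQL statement(s)"
  }
}

Because the JVM lock is reentrant, calling the synchronized getters from inside listener.synchronized is safe, which is exactly what the rewritten render methods do.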
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala index 3b01afa603ce..af16cb31df18 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala @@ -40,21 +40,22 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) def render(request: HttpServletRequest): Seq[Node] = { val parameterId = request.getParameter("id") require(parameterId != null && parameterId.nonEmpty, "Missing id parameter") - val sessionStat = listener.sessionList.find(stat => { - stat._1 == parameterId - }).getOrElse(null) - require(sessionStat != null, "Invalid sessionID[" + parameterId + "]") val content = - generateBasicStats() ++ -
      <br/> ++
-      <h4>
-      User {sessionStat._2.userName},
-      IP {sessionStat._2.ip},
-      Session created at {formatDate(sessionStat._2.startTimestamp)},
-      Total run {sessionStat._2.totalExecution} SQL
-      </h4> ++
-      generateSQLStatsTable(sessionStat._2.sessionId)
+    listener.synchronized { // make sure all parts in this page are consistent
+      val sessionStat = listener.getSession(parameterId).getOrElse(null)
+      require(sessionStat != null, "Invalid sessionID[" + parameterId + "]")
+
+      generateBasicStats() ++
+      <br/> ++
+      <h4>
+      User {sessionStat.userName},
+      IP {sessionStat.ip},
+      Session created at {formatDate(sessionStat.startTimestamp)},
+      Total run {sessionStat.totalExecution} SQL
+      </h4>
++ + generateSQLStatsTable(sessionStat.sessionId) + } UIUtils.headerSparkPage("JDBC/ODBC Session", content, parent, Some(5000)) } @@ -73,13 +74,13 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) /** Generate stats of batch statements of the thrift server program */ private def generateSQLStatsTable(sessionID: String): Seq[Node] = { - val executionList = listener.executionList - .filter(_._2.sessionId == sessionID) + val executionList = listener.getExecutionList + .filter(_.sessionId == sessionID) val numStatement = executionList.size val table = if (numStatement > 0) { val headerRow = Seq("User", "JobID", "GroupID", "Start Time", "Finish Time", "Duration", "Statement", "State", "Detail") - val dataRows = executionList.values.toSeq.sortBy(_.startTimestamp).reverse + val dataRows = executionList.sortBy(_.startTimestamp).reverse def generateDataRow(info: ExecutionInfo): Seq[Node] = { val jobLink = info.jobId.map { id: String => @@ -146,10 +147,11 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) /** Generate stats of batch sessions of the thrift server program */ private def generateSessionStatsTable(): Seq[Node] = { - val numBatches = listener.sessionList.size + val sessionList = listener.getSessionList + val numBatches = sessionList.size val table = if (numBatches > 0) { val dataRows = - listener.sessionList.values.toSeq.sortBy(_.startTimestamp).reverse.map ( session => + sessionList.sortBy(_.startTimestamp).reverse.map ( session => Seq( session.userName, session.ip, diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala index 94fd8a6bb60b..4eabeaa6735e 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala @@ -27,9 +27,9 @@ import org.apache.spark.{SparkContext, Logging, SparkException} * This assumes the given SparkContext has enabled its SparkUI. 
*/ private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) - extends SparkUITab(getSparkUI(sparkContext), "sql") with Logging { + extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { - override val name = "SQL" + override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index df80d04b4080..e59a14ec00d5 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -18,17 +18,19 @@ package org.apache.spark.sql.hive.thriftserver import java.io._ +import java.sql.Timestamp +import java.util.Date import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.concurrent.{Await, Promise} -import scala.sys.process.{Process, ProcessLogger} +import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.scalatest.BeforeAndAfter -import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkFunSuite} /** * A test suite for the `spark-sql` CLI tool. Note that all test cases share the same temporary @@ -37,43 +39,62 @@ import org.apache.spark.util.Utils class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { val warehousePath = Utils.createTempDir() val metastorePath = Utils.createTempDir() + val scratchDirPath = Utils.createTempDir() before { - warehousePath.delete() - metastorePath.delete() + warehousePath.delete() + metastorePath.delete() + scratchDirPath.delete() } after { - warehousePath.delete() - metastorePath.delete() + warehousePath.delete() + metastorePath.delete() + scratchDirPath.delete() } + /** + * Run a CLI operation and expect all the queries and expected answers to be returned. + * @param timeout maximum time for the commands to complete + * @param extraArgs any extra arguments + * @param errorResponses a sequence of strings whose presence in the stdout of the forked process + * is taken as an immediate error condition. That is: if a line beginning + * with one of these strings is found, fail the test immediately. + * The default value is `Seq("Error:")` + * + * @param queriesAndExpectedAnswers one or more tupes of query + answer + */ def runCliWithin( timeout: FiniteDuration, - extraArgs: Seq[String] = Seq.empty)( + extraArgs: Seq[String] = Seq.empty, + errorResponses: Seq[String] = Seq("Error:"))( queriesAndExpectedAnswers: (String, String)*): Unit = { val (queries, expectedAnswers) = queriesAndExpectedAnswers.unzip - val cliScript = "../../bin/spark-sql".split("/").mkString(File.separator) + // Explicitly adds ENTER for each statement to make sure they are actually entered into the CLI. 
+ val queriesString = queries.map(_ + "\n").mkString val command = { + val cliScript = "../../bin/spark-sql".split("/").mkString(File.separator) val jdbcUrl = s"jdbc:derby:;databaseName=$metastorePath;create=true" s"""$cliScript | --master local | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$jdbcUrl | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath + | --hiveconf ${ConfVars.SCRATCHDIR}=$scratchDirPath """.stripMargin.split("\\s+").toSeq ++ extraArgs } var next = 0 val foundAllExpectedAnswers = Promise.apply[Unit]() - // Explicitly adds ENTER for each statement to make sure they are actually entered into the CLI. - val queryStream = new ByteArrayInputStream(queries.map(_ + "\n").mkString.getBytes) val buffer = new ArrayBuffer[String]() val lock = new Object def captureOutput(source: String)(line: String): Unit = lock.synchronized { - buffer += s"$source> $line" + // This test suite sometimes gets extremely slow out of unknown reason on Jenkins. Here we + // add a timestamp to provide more diagnosis information. + buffer += s"${new Timestamp(new Date().getTime)} - $source> $line" + // If we haven't found all expected answers and another expected answer comes up... if (next < expectedAnswers.size && line.startsWith(expectedAnswers(next))) { next += 1 @@ -81,23 +102,36 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { if (next == expectedAnswers.size) { foundAllExpectedAnswers.trySuccess(()) } + } else { + errorResponses.foreach { r => + if (line.startsWith(r)) { + foundAllExpectedAnswers.tryFailure( + new RuntimeException(s"Failed with error line '$line'")) + } + } } } - // Searching expected output line from both stdout and stderr of the CLI process - val process = (Process(command, None) #< queryStream).run( - ProcessLogger(captureOutput("stdout"), captureOutput("stderr"))) + val process = new ProcessBuilder(command: _*).start() + + val stdinWriter = new OutputStreamWriter(process.getOutputStream) + stdinWriter.write(queriesString) + stdinWriter.flush() + stdinWriter.close() + + new ProcessOutputCapturer(process.getInputStream, captureOutput("stdout")).start() + new ProcessOutputCapturer(process.getErrorStream, captureOutput("stderr")).start() try { Await.result(foundAllExpectedAnswers.future, timeout) } catch { case cause: Throwable => - logError( + val message = s""" |======================= |CliSuite failure output |======================= |Spark SQL CLI command line: ${command.mkString(" ")} - | + |Exception: $cause |Executed query $next "${queries(next)}", |But failed to capture expected output "${expectedAnswers(next)}" within $timeout. 
| @@ -105,8 +139,9 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { |=========================== |End CliSuite failure output |=========================== - """.stripMargin, cause) - throw cause + """.stripMargin + logError(message, cause) + fail(message, cause) } finally { process.destroy() } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 39b31523e07c..b72249b3bf8c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -19,14 +19,12 @@ package org.apache.spark.sql.hive.thriftserver import java.io.File import java.net.URL -import java.nio.charset.StandardCharsets import java.sql.{Date, DriverManager, SQLException, Statement} import scala.collection.mutable.ArrayBuffer +import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration._ import scala.concurrent.{Await, Promise, future} -import scala.concurrent.ExecutionContext.Implicits.global -import scala.sys.process.{Process, ProcessLogger} import scala.util.{Random, Try} import com.google.common.base.Charsets.UTF_8 @@ -41,9 +39,10 @@ import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.transport.TSocket import org.scalatest.BeforeAndAfterAll -import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.sql.hive.HiveContext +import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkFunSuite} object TestData { def getTestDataFilePath(name: String): URL = { @@ -378,6 +377,60 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { rs2.close() } } + + test("test add jar") { + withMultipleConnectionJdbcStatement( + { + statement => + val jarFile = + "../hive/src/test/resources/hive-hcatalog-core-0.13.1.jar" + .split("/") + .mkString(File.separator) + + statement.executeQuery(s"ADD JAR $jarFile") + }, + + { + statement => + val queries = Seq( + "DROP TABLE IF EXISTS smallKV", + "CREATE TABLE smallKV(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE smallKV", + "DROP TABLE IF EXISTS addJar", + """CREATE TABLE addJar(key string) + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + """.stripMargin) + + queries.foreach(statement.execute) + + statement.executeQuery( + """ + |INSERT INTO TABLE addJar SELECT 'k1' as key FROM smallKV limit 1 + """.stripMargin) + + val actualResult = + statement.executeQuery("SELECT key FROM addJar") + val actualResultBuffer = new collection.mutable.ArrayBuffer[String]() + while (actualResult.next()) { + actualResultBuffer += actualResult.getString(1) + } + actualResult.close() + + val expectedResult = + statement.executeQuery("SELECT 'k1'") + val expectedResultBuffer = new collection.mutable.ArrayBuffer[String]() + while (expectedResult.next()) { + expectedResultBuffer += expectedResult.getString(1) + } + expectedResult.close() + + assert(expectedResultBuffer === actualResultBuffer) + + statement.executeQuery("DROP TABLE IF EXISTS addJar") + statement.executeQuery("DROP TABLE IF EXISTS smallKV") + } + ) + } } class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { @@ -483,7 +536,7 @@ abstract class HiveThriftServer2Test 
extends SparkFunSuite with BeforeAndAfterAl val tempLog4jConf = Utils.createTempDir().getCanonicalPath Files.write( - """log4j.rootCategory=INFO, console + """log4j.rootCategory=DEBUG, console |log4j.appender.console=org.apache.log4j.ConsoleAppender |log4j.appender.console.target=System.err |log4j.appender.console.layout=org.apache.log4j.PatternLayout @@ -492,7 +545,7 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl new File(s"$tempLog4jConf/log4j.properties"), UTF_8) - tempLog4jConf + File.pathSeparator + sys.props("java.class.path") + tempLog4jConf } s"""$startScript @@ -508,6 +561,20 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl """.stripMargin.split("\\s+").toSeq } + /** + * String to scan for when looking for the the thrift binary endpoint running. + * This can change across Hive versions. + */ + val THRIFT_BINARY_SERVICE_LIVE = "Starting ThriftBinaryCLIService on port" + + /** + * String to scan for when looking for the the thrift HTTP endpoint running. + * This can change across Hive versions. + */ + val THRIFT_HTTP_SERVICE_LIVE = "Started ThriftHttpCLIService in http" + + val SERVER_STARTUP_TIMEOUT = 3.minutes + private def startThriftServer(port: Int, attempt: Int) = { warehousePath = Utils.createTempDir() warehousePath.delete() @@ -528,45 +595,59 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl logInfo(s"Trying to start HiveThriftServer2: port=$port, mode=$mode, attempt=$attempt") - val env = Seq( - // Disables SPARK_TESTING to exclude log4j.properties in test directories. - "SPARK_TESTING" -> "0", - // Points SPARK_PID_DIR to SPARK_HOME, otherwise only 1 Thrift server instance can be started - // at a time, which is not Jenkins friendly. - "SPARK_PID_DIR" -> pidDir.getCanonicalPath) - - logPath = Process(command, None, env: _*).lines.collectFirst { - case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length)) - }.getOrElse { - throw new RuntimeException("Failed to find HiveThriftServer2 log file.") + logPath = { + val lines = Utils.executeAndGetOutput( + command = command, + extraEnvironment = Map( + // Disables SPARK_TESTING to exclude log4j.properties in test directories. + "SPARK_TESTING" -> "0", + // Points SPARK_PID_DIR to SPARK_HOME, otherwise only 1 Thrift server instance can be + // started at a time, which is not Jenkins friendly. + "SPARK_PID_DIR" -> pidDir.getCanonicalPath), + redirectStderr = true) + + lines.split("\n").collectFirst { + case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length)) + }.getOrElse { + throw new RuntimeException("Failed to find HiveThriftServer2 log file.") + } } val serverStarted = Promise[Unit]() // Ensures that the following "tail" command won't fail. logPath.createNewFile() - logTailingProcess = + val successLines = Seq(THRIFT_BINARY_SERVICE_LIVE, THRIFT_HTTP_SERVICE_LIVE) + + logTailingProcess = { + val command = s"/usr/bin/env tail -n +0 -f ${logPath.getCanonicalPath}".split(" ") // Using "-n +0" to make sure all lines in the log file are checked. 
- Process(s"/usr/bin/env tail -n +0 -f ${logPath.getCanonicalPath}").run(ProcessLogger( - (line: String) => { - diagnosisBuffer += line + val builder = new ProcessBuilder(command: _*) + val captureOutput = (line: String) => diagnosisBuffer.synchronized { + diagnosisBuffer += line - if (line.contains("ThriftBinaryCLIService listening on") || - line.contains("Started ThriftHttpCLIService in http")) { + successLines.foreach { r => + if (line.contains(r)) { serverStarted.trySuccess(()) - } else if (line.contains("HiveServer2 is stopped")) { - // This log line appears when the server fails to start and terminates gracefully (e.g. - // because of port contention). - serverStarted.tryFailure(new RuntimeException("Failed to start HiveThriftServer2")) } - })) + } + } + + val process = builder.start() + + new ProcessOutputCapturer(process.getInputStream, captureOutput).start() + new ProcessOutputCapturer(process.getErrorStream, captureOutput).start() + process + } - Await.result(serverStarted.future, 2.minute) + Await.result(serverStarted.future, SERVER_STARTUP_TIMEOUT) } private def stopThriftServer(): Unit = { // The `spark-daemon.sh' script uses kill, which is not synchronous, have to wait for a while. - Process(stopScript, None, "SPARK_PID_DIR" -> pidDir.getCanonicalPath).run().exitValue() + Utils.executeAndGetOutput( + command = Seq(stopScript), + extraEnvironment = Map("SPARK_PID_DIR" -> pidDir.getCanonicalPath)) Thread.sleep(3.seconds.toMillis) warehousePath.delete() diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala index 806240e6de45..bf431cd6b026 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala @@ -27,7 +27,6 @@ import org.scalatest.concurrent.Eventually._ import org.scalatest.selenium.WebBrowser import org.scalatest.time.SpanSugar._ -import org.apache.spark.sql.hive.HiveContext import org.apache.spark.ui.SparkUICssErrorHandler class UISeleniumSuite @@ -36,7 +35,6 @@ class UISeleniumSuite implicit var webDriver: WebDriver = _ var server: HiveThriftServer2 = _ - var hc: HiveContext = _ val uiPort = 20000 + Random.nextInt(10000) override def mode: ServerMode.Value = ServerMode.binary diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 53d5b22b527b..ab309e0a1d36 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.execution import java.io.File import java.util.{Locale, TimeZone} +import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.scalatest.BeforeAndAfter import org.apache.spark.sql.SQLConf @@ -50,6 +51,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5) // Enable in-memory partition pruning for testing purposes TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) + RuleExecutor.resetTime() } override def afterAll() { @@ -58,6 +60,9 @@ class HiveCompatibilitySuite extends 
HiveQueryFileTest with BeforeAndAfter { Locale.setDefault(originalLocale) TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) + + // For debugging dump some statistics about how much time was spent in various optimizer rules. + logWarning(RuleExecutor.dumpTimeSpent()) } /** A list of tests deemed out of scope currently and thus completely disregarded. */ @@ -266,8 +271,38 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Hive returns string from UTC formatted timestamp, spark returns timestamp type "date_udf", + // Can't compare the result that have newline in it + "udf_get_json_object", + // Unlike Hive, we do support log base in (0, 1.0], therefore disable this - "udf7" + "udf7", + + // Trivial changes to DDL output + "compute_stats_empty_table", + "compute_stats_long", + "create_view_translate", + "show_create_table_serde", + "show_tblproperties", + + // Odd changes to output + "merge4", + + // Thift is broken... + "inputddl8", + + // Hive changed ordering of ddl: + "varchar_union1", + + // Parser changes in Hive 1.2 + "input25", + "input26", + + // Uses invalid table name + "innerjoin", + + // classpath problems + "compute_stats.*", + "udf_bitmap_.*" ) /** diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b00f320318be..0a50a7f2bfb8 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../../pom.xml @@ -36,6 +36,11 @@ + + + com.twitter + parquet-hadoop-bundle + org.apache.spark spark-core_${scala.binary.version} @@ -53,32 +58,42 @@ spark-sql_${scala.binary.version} ${project.version} + - org.codehaus.jackson - jackson-mapper-asl + ${hive.group} + hive-exec + ${hive.group} - hive-serde + hive-metastore + org.apache.avro @@ -91,6 +106,55 @@ avro-mapred ${avro.mapred.classifier} + + commons-httpclient + commons-httpclient + + + org.apache.calcite + calcite-avatica + + + org.apache.calcite + calcite-core + + + org.apache.httpcomponents + httpclient + + + org.codehaus.jackson + jackson-mapper-asl + + + + commons-codec + commons-codec + + + joda-time + joda-time + + + org.jodd + jodd-core + + + com.google.code.findbugs + jsr305 + + + org.datanucleus + datanucleus-core + + + org.apache.thrift + libthrift + + + org.apache.thrift + libfb303 + org.scalacheck scalacheck_${scala.binary.version} diff --git a/sql/hive/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/hive/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister new file mode 100644 index 000000000000..4a774fbf1fdf --- /dev/null +++ b/sql/hive/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -0,0 +1 @@ +org.apache.spark.sql.hive.orc.DefaultSource diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 110f51a30586..83116e90db3f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -20,15 +20,18 @@ package org.apache.spark.sql.hive import java.io.File import java.net.{URL, URLClassLoader} import java.sql.Timestamp +import java.util.concurrent.TimeUnit import scala.collection.JavaConversions._ import scala.collection.mutable.HashMap import scala.language.implicitConversions +import 
scala.concurrent.duration._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.StatsSetupConst import org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.conf.HiveConf +import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.ql.metadata.Table import org.apache.hadoop.hive.ql.parse.VariableSubstitution import org.apache.hadoop.hive.ql.session.SessionState @@ -40,7 +43,7 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ import org.apache.spark.sql.SQLConf.SQLConfEntry import org.apache.spark.sql.SQLConf.SQLConfEntry._ -import org.apache.spark.sql.catalyst.{TableIdentifier, ParserDialect} +import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier, ParserDialect} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, SetCommand} @@ -108,8 +111,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * this does not necessarily need to be the same version of Hive that is used internally by * Spark SQL for execution. */ - protected[hive] def hiveMetastoreVersion: String = - getConf(HIVE_METASTORE_VERSION, hiveExecutionVersion) + protected[hive] def hiveMetastoreVersion: String = getConf(HIVE_METASTORE_VERSION) /** * The location of the jars that should be used to instantiate the HiveMetastoreClient. This @@ -164,6 +166,16 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { } SessionState.setCurrentSessionState(executionHive.state) + /** + * Overrides default Hive configurations to avoid breaking changes to Spark SQL users. + * - allow SQL11 keywords to be used as identifiers + */ + private[sql] def defaultOverrides() = { + setConf(ConfVars.HIVE_SUPPORT_SQL11_RESERVED_KEYWORDS.varname, "false") + } + + defaultOverrides() + /** * The copy of the Hive client that is used to retrieve metadata from the Hive MetaStore. * The version of the Hive client that is used here must match the metastore that is configured @@ -176,6 +188,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // We instantiate a HiveConf here to read in the hive-site.xml file and then pass the options // into the isolated client loader val metadataConf = new HiveConf() + + val defaultWarehouseLocation = metadataConf.get("hive.metastore.warehouse.dir") + logInfo("default warehouse location is " + defaultWarehouseLocation) + // `configure` goes second to override other settings. val allConfig = metadataConf.iterator.map(e => e.getKey -> e.getValue).toMap ++ configure @@ -185,7 +201,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { "Builtin jars can only be used when hive execution version == hive metastore version. " + s"Execution: ${hiveExecutionVersion} != Metastore: ${hiveMetastoreVersion}. " + "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " + - s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.") + s"or change ${HIVE_METASTORE_VERSION.key} to $hiveExecutionVersion.") } // We recursively find all jars in the class loader chain, @@ -218,7 +234,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // TODO: Support for loading the jars from an already downloaded location. 
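A note on the defaultOverrides() hook introduced above: it pins Hive options whose defaults changed across Hive releases (here, treating SQL11 reserved keywords as ordinary identifiers) so that existing Spark SQL queries keep parsing after the Hive upgrade. A minimal, self-contained sketch of the same construct-time override pattern, using an illustrative ConfHolder class rather than the real HiveContext internals:

// Sketch only: apply "safe defaults" on top of user-supplied settings at construction time.
// The class and the stored key below are illustrative, not the actual HiveContext fields.
class ConfHolder(userSettings: Map[String, String]) {
  private val settings = scala.collection.mutable.Map[String, String]() ++= userSettings

  def setConf(key: String, value: String): Unit = settings(key) = value
  def getConf(key: String, default: String): String = settings.getOrElse(key, default)

  // Applied unconditionally, mirroring defaultOverrides() above, which disables
  // SQL11 reserved-keyword handling so keyword-named identifiers keep working.
  private def defaultOverrides(): Unit =
    setConf("hive.support.sql11.reserved.keywords", "false")

  defaultOverrides()
}

In the sketch the override runs last in the constructor, so it wins over anything passed in, but it can still be changed afterwards via setConf.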
logInfo( s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using maven.") - IsolatedClientLoader.forVersion(hiveMetastoreVersion, allConfig) + IsolatedClientLoader.forVersion( + version = hiveMetastoreVersion, + config = allConfig, + barrierPrefixes = hiveMetastoreBarrierPrefixes, + sharedPrefixes = hiveMetastoreSharedPrefixes) } else { // Convert to files and expand any directories. val jars = @@ -252,6 +272,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { } protected[sql] override def parseSql(sql: String): LogicalPlan = { + var state = SessionState.get() + if (state == null) { + SessionState.setCurrentSessionState(tlSession.get().asInstanceOf[SQLSession].sessionState) + } super.parseSql(substitutor.substitute(hiveconf, sql)) } @@ -267,12 +291,13 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * @since 1.3.0 */ def refreshTable(tableName: String): Unit = { - val tableIdent = TableIdentifier(tableName).withDatabase(catalog.client.currentDatabase) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) catalog.refreshTable(tableIdent) } protected[hive] def invalidateTable(tableName: String): Unit = { - catalog.invalidateTable(catalog.client.currentDatabase, tableName) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) + catalog.invalidateTable(tableIdent) } /** @@ -286,7 +311,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { */ @Experimental def analyze(tableName: String) { - val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) relation match { case relation: MetastoreRelation => @@ -298,10 +324,21 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // Can we use fs.getContentSummary in future? // Seems fs.getContentSummary returns wrong table size on Jenkins. So we use // countFileSize to count the table size. + val stagingDir = metadataHive.getConf(HiveConf.ConfVars.STAGINGDIR.varname, + HiveConf.ConfVars.STAGINGDIR.defaultStrVal) + def calculateTableSize(fs: FileSystem, path: Path): Long = { val fileStatus = fs.getFileStatus(path) val size = if (fileStatus.isDir) { - fs.listStatus(path).map(status => calculateTableSize(fs, status.getPath)).sum + fs.listStatus(path) + .map { status => + if (!status.getPath().getName().startsWith(stagingDir)) { + calculateTableSize(fs, status.getPath) + } else { + 0L + } + } + .sum } else { fileStatus.getLen } @@ -398,7 +435,58 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { } /** Overridden by child classes that need to set configuration before the client init. */ - protected def configure(): Map[String, String] = Map.empty + protected def configure(): Map[String, String] = { + // Hive 0.14.0 introduces timeout operations in HiveConf, and changes default values of a bunch + // of time `ConfVar`s by adding time suffixes (`s`, `ms`, and `d` etc.). This breaks backwards- + // compatibility when users are trying to connecting to a Hive metastore of lower version, + // because these options are expected to be integral values in lower versions of Hive. + // + // Here we enumerate all time `ConfVar`s and convert their values to numeric strings according + // to their output time units. 
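The long enumeration that follows boils down to one conversion per ConfVar: read the timeout in its expected unit and emit a bare number, since pre-0.14 metastores reject suffixed values such as "600s". A standalone sketch of that conversion (keys and raw values here are illustrative; the real code obtains them via hiveconf.getTimeVar):

import java.util.concurrent.TimeUnit

// Render a duration, held internally in milliseconds in this sketch, as the bare
// number an older Hive metastore expects in the given output unit.
def toLegacyValue(millis: Long, outputUnit: TimeUnit): String =
  outputUnit.convert(millis, TimeUnit.MILLISECONDS).toString

// e.g. "600" (seconds) and "5000" (milliseconds)
val legacyProps = Map(
  "hive.metastore.client.socket.timeout" -> toLegacyValue(600000L, TimeUnit.SECONDS),
  "hive.server2.long.polling.timeout" -> toLegacyValue(5000L, TimeUnit.MILLISECONDS))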
+ Seq( + ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY -> TimeUnit.SECONDS, + ConfVars.METASTORE_CLIENT_SOCKET_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.METASTORE_CLIENT_SOCKET_LIFETIME -> TimeUnit.SECONDS, + ConfVars.HMSHANDLERINTERVAL -> TimeUnit.MILLISECONDS, + ConfVars.METASTORE_EVENT_DB_LISTENER_TTL -> TimeUnit.SECONDS, + ConfVars.METASTORE_EVENT_CLEAN_FREQ -> TimeUnit.SECONDS, + ConfVars.METASTORE_EVENT_EXPIRY_DURATION -> TimeUnit.SECONDS, + ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL -> TimeUnit.SECONDS, + ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT -> TimeUnit.MILLISECONDS, + ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT -> TimeUnit.MILLISECONDS, + ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_LOG_INCREMENTAL_PLAN_PROGRESS_INTERVAL -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_STATS_JDBC_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_STATS_RETRIES_WAIT -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_LOCK_SLEEP_BETWEEN_RETRIES -> TimeUnit.SECONDS, + ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_ZOOKEEPER_CONNECTION_BASESLEEPTIME -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_TXN_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_COMPACTOR_WORKER_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_COMPACTOR_CHECK_INTERVAL -> TimeUnit.SECONDS, + ConfVars.HIVE_COMPACTOR_CLEANER_RUN_INTERVAL -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_THRIFT_HTTP_MAX_IDLE_TIME -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_THRIFT_HTTP_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_THRIFT_HTTP_COOKIE_MAX_AGE -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_THRIFT_LOGIN_BEBACKOFF_SLOT_LENGTH -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_THRIFT_LOGIN_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_THRIFT_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_ASYNC_EXEC_SHUTDOWN_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_ASYNC_EXEC_KEEPALIVE_TIME -> TimeUnit.SECONDS, + ConfVars.HIVE_SERVER2_LONG_POLLING_TIMEOUT -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_SESSION_CHECK_INTERVAL -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_IDLE_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS, + ConfVars.HIVE_SERVER2_IDLE_OPERATION_TIMEOUT -> TimeUnit.MILLISECONDS, + ConfVars.SERVER_READ_SOCKET_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.HIVE_LOCALIZE_RESOURCE_WAIT_INTERVAL -> TimeUnit.MILLISECONDS, + ConfVars.SPARK_CLIENT_FUTURE_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.SPARK_JOB_MONITOR_TIMEOUT -> TimeUnit.SECONDS, + ConfVars.SPARK_RPC_CLIENT_CONNECT_TIMEOUT -> TimeUnit.MILLISECONDS, + ConfVars.SPARK_RPC_CLIENT_HANDSHAKE_TIMEOUT -> TimeUnit.MILLISECONDS + ).map { case (confVar, unit) => + confVar.varname -> hiveconf.getTimeVar(confVar, unit).toString + }.toMap + } protected[hive] class SQLSession extends super.SQLSession { protected[sql] override lazy val conf: SQLConf = new SQLConf { @@ -452,7 +540,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { HashAggregation, Aggregation, LeftSemiJoin, - HashJoin, + EquiJoinSelection, BasicOperators, CartesianProduct, BroadcastNestedLoopJoin @@ -515,19 +603,27 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { private[hive] object HiveContext { /** The version of hive used internally by Spark SQL. 
*/ - val hiveExecutionVersion: String = "0.13.1" + val hiveExecutionVersion: String = "1.2.1" + + val HIVE_METASTORE_VERSION = stringConf("spark.sql.hive.metastore.version", + defaultValue = Some(hiveExecutionVersion), + doc = "Version of the Hive metastore. Available options are " + + s"0.12.0 through $hiveExecutionVersion.") - val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version" val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars", defaultValue = Some("builtin"), - doc = "Location of the jars that should be used to instantiate the HiveMetastoreClient. This" + - " property can be one of three options: " + - "1. \"builtin\" Use Hive 0.13.1, which is bundled with the Spark assembly jar when " + - "-Phive is enabled. When this option is chosen, " + - "spark.sql.hive.metastore.version must be either 0.13.1 or not defined. " + - "2. \"maven\" Use Hive jars of specified version downloaded from Maven repositories." + - "3. A classpath in the standard format for both Hive and Hadoop.") - + doc = s""" + | Location of the jars that should be used to instantiate the HiveMetastoreClient. + | This property can be one of three options: " + | 1. "builtin" + | Use Hive ${hiveExecutionVersion}, which is bundled with the Spark assembly jar when + | -Phive is enabled. When this option is chosen, + | spark.sql.hive.metastore.version must be either + | ${hiveExecutionVersion} or not defined. + | 2. "maven" + | Use Hive jars of specified version downloaded from Maven repositories. + | 3. A classpath in the standard format for both Hive and Hadoop. + """.stripMargin) val CONVERT_METASTORE_PARQUET = booleanConf("spark.sql.hive.convertMetastoreParquet", defaultValue = Some(true), doc = "When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of " + @@ -566,17 +662,18 @@ private[hive] object HiveContext { /** Constructs a configuration for hive, where the metastore is located in a temp directory. */ def newTemporaryConfiguration(): Map[String, String] = { val tempDir = Utils.createTempDir() - val localMetastore = new File(tempDir, "metastore").getAbsolutePath + val localMetastore = new File(tempDir, "metastore") val propMap: HashMap[String, String] = HashMap() // We have to mask all properties in hive-site.xml that relates to metastore data source // as we used a local metastore here. 
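The temporary-configuration helper being reworked here now points both the warehouse and the embedded Derby metastore at a scratch directory instead of only the Derby database. A self-contained sketch of that layout (the helper name and directory names are illustrative; masking the datanucleus/jdo properties from hive-site.xml is handled by the loop in the original code):

import java.io.File
import java.nio.file.Files

// Sketch: build the two properties that isolate a test HiveContext, a throwaway
// warehouse directory plus an embedded Derby metastore created on first use.
def temporaryMetastoreProps(): Map[String, String] = {
  val tempDir = Files.createTempDirectory("spark-hive-test").toFile
  val metastore = new File(tempDir, "metastore")
  Map(
    "hive.metastore.warehouse.dir" -> new File(tempDir, "warehouse").toURI.toString,
    "javax.jdo.option.ConnectionURL" ->
      s"jdbc:derby:;databaseName=${metastore.getAbsolutePath};create=true")
}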
HiveConf.ConfVars.values().foreach { confvar => if (confvar.varname.contains("datanucleus") || confvar.varname.contains("jdo")) { - propMap.put(confvar.varname, confvar.defaultVal) + propMap.put(confvar.varname, confvar.getDefaultExpr()) } } - propMap.put("javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$localMetastore;create=true") + propMap.put(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, localMetastore.toURI.toString) + propMap.put(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, + s"jdbc:derby:;databaseName=${localMetastore.getAbsolutePath};create=true") propMap.put("datanucleus.rdbms.datastoreAdapterClassName", "org.datanucleus.store.rdbms.adapter.DerbyAdapter") propMap.toMap diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 39d798d072ae..64fffdbf9b02 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -370,17 +370,36 @@ private[hive] trait HiveInspectors { protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match { case _: JavaHiveVarcharObjectInspector => (o: Any) => - val s = o.asInstanceOf[UTF8String].toString - new HiveVarchar(s, s.size) + if (o != null) { + val s = o.asInstanceOf[UTF8String].toString + new HiveVarchar(s, s.size) + } else { + null + } case _: JavaHiveDecimalObjectInspector => - (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + (o: Any) => + if (o != null) { + HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + } else { + null + } case _: JavaDateObjectInspector => - (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + (o: Any) => + if (o != null) { + DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + } else { + null + } case _: JavaTimestampObjectInspector => - (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + (o: Any) => + if (o != null) { + DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + } else { + null + } case soi: StandardStructObjectInspector => val schema = dataType.asInstanceOf[StructType] @@ -390,8 +409,10 @@ private[hive] trait HiveInspectors { (o: Any) => { if (o != null) { val struct = soi.create() - (soi.getAllStructFieldRefs, wrappers, o.asInstanceOf[InternalRow].toSeq).zipped.foreach { - (field, wrapper, data) => soi.setStructFieldData(struct, field, wrapper(data)) + val row = o.asInstanceOf[InternalRow] + soi.getAllStructFieldRefs.zip(wrappers).zipWithIndex.foreach { + case ((field, wrapper), i) => + soi.setStructFieldData(struct, field, wrapper(row.get(i, schema(i).dataType))) } struct } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index a8c9b4fa71b9..98d21aa76d64 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -18,11 +18,13 @@ package org.apache.spark.sql.hive import scala.collection.JavaConversions._ +import scala.collection.mutable import com.google.common.base.Objects import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.common.StatsSetupConst +import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.Warehouse import 
org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.ql.metadata._ @@ -31,18 +33,67 @@ import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.Logging import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.{InternalRow, SqlParser, TableIdentifier} -import org.apache.spark.sql.execution.datasources +import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource} +import org.apache.spark.sql.execution.{FileRelation, datasources} import org.apache.spark.sql.hive.client._ -import org.apache.spark.sql.parquet.ParquetRelation +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode} +private[hive] case class HiveSerDe( + inputFormat: Option[String] = None, + outputFormat: Option[String] = None, + serde: Option[String] = None) + +private[hive] object HiveSerDe { + /** + * Get the Hive SerDe information from the data source abbreviation string or classname. + * + * @param source Currently the source abbreviation can be one of the following: + * SequenceFile, RCFile, ORC, PARQUET, and case insensitive. + * @param hiveConf Hive Conf + * @return HiveSerDe associated with the specified source + */ + def sourceToSerDe(source: String, hiveConf: HiveConf): Option[HiveSerDe] = { + val serdeMap = Map( + "sequencefile" -> + HiveSerDe( + inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"), + outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")), + + "rcfile" -> + HiveSerDe( + inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"), + serde = Option(hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTRCFILESERDE))), + + "orc" -> + HiveSerDe( + inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"), + serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")), + + "parquet" -> + HiveSerDe( + inputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"), + serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))) + + val key = source.toLowerCase match { + case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet" + case s if s.startsWith("org.apache.spark.sql.orc") => "orc" + case s => s + } + + serdeMap.get(key) + } +} + private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext) extends Catalog with Logging { @@ -123,10 +174,13 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // it is better at here to invalidate the cache to avoid confusing waring logs from the // cache loader (e.g. cannot find data source provider, which is only defined for // data source table.). 
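The sourceToSerDe helper introduced above accepts either a short abbreviation or a full data source class name; the only non-obvious step is the normalisation of the key before the map lookup. That step in isolation, runnable on its own:

// Collapse the built-in Parquet/ORC class names to their short keys, case-insensitively,
// before looking up the Hive input/output format and SerDe.
def normalizeProvider(source: String): String = source.toLowerCase match {
  case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet"
  case s if s.startsWith("org.apache.spark.sql.orc") => "orc"
  case s => s
}

assert(normalizeProvider("PARQUET") == "parquet")
assert(normalizeProvider("org.apache.spark.sql.orc.DefaultSource") == "orc")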
- invalidateTable(tableIdent.database.getOrElse(client.currentDatabase), tableIdent.table) + invalidateTable(tableIdent) } - def invalidateTable(databaseName: String, tableName: String): Unit = { + def invalidateTable(tableIdent: TableIdentifier): Unit = { + val databaseName = tableIdent.database.getOrElse(client.currentDatabase) + val tableName = tableIdent.table + cachedDataSourceTables.invalidate(QualifiedTableName(databaseName, tableName).toLowerCase) } @@ -136,6 +190,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive * Creates a data source table (a table created with USING clause) in Hive's metastore. * Returns true when the table has been created. Otherwise, false. */ + // TODO: Remove this in SPARK-10104. def createDataSourceTable( tableName: String, userSpecifiedSchema: Option[StructType], @@ -152,7 +207,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive isExternal) } - private def createDataSourceTable( + def createDataSourceTable( tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], partitionColumns: Array[String], @@ -164,15 +219,15 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive processDatabaseAndTableName(database, tableIdent.table) } - val tableProperties = new scala.collection.mutable.HashMap[String, String] + val tableProperties = new mutable.HashMap[String, String] tableProperties.put("spark.sql.sources.provider", provider) // Saves optional user specified schema. Serialized JSON schema string may be too long to be // stored into a single metastore SerDe property. In this case, we split the JSON string and // store each part as a separate SerDe property. - if (userSpecifiedSchema.isDefined) { + userSpecifiedSchema.foreach { schema => val threshold = conf.schemaStringLengthThreshold - val schemaJsonString = userSpecifiedSchema.get.json + val schemaJsonString = schema.json // Split the JSON string. val parts = schemaJsonString.grouped(threshold).toSeq tableProperties.put("spark.sql.sources.schema.numParts", parts.size.toString) @@ -194,7 +249,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // The table does not have a specified schema, which means that the schema will be inferred // when we load the table. So, we are not expecting partition columns and we will discover // partitions when we load the table. However, if there are specified partition columns, - // we simplily ignore them and provide a warning message.. + // we simply ignore them and provide a warning message. logWarning( s"The schema and partitions of table $tableIdent will be inferred when it is loaded. 
" + s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.") @@ -210,7 +265,11 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive ManagedTable } - client.createTable( + val maybeSerDe = HiveSerDe.sourceToSerDe(provider, hive.hiveconf) + val dataSource = ResolvedDataSource( + hive, userSpecifiedSchema, partitionColumns, provider, options) + + def newSparkSQLSpecificMetastoreTable(): HiveTable = { HiveTable( specifiedDatabase = Option(dbName), name = tblName, @@ -218,14 +277,114 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive partitionColumns = metastorePartitionColumns, tableType = tableType, properties = tableProperties.toMap, - serdeProperties = options)) + serdeProperties = options) + } + + def newHiveCompatibleMetastoreTable(relation: HadoopFsRelation, serde: HiveSerDe): HiveTable = { + def schemaToHiveColumn(schema: StructType): Seq[HiveColumn] = { + schema.map { field => + HiveColumn( + name = field.name, + hiveType = HiveMetastoreTypes.toMetastoreType(field.dataType), + comment = "") + } + } + + val partitionColumns = schemaToHiveColumn(relation.partitionColumns) + val dataColumns = schemaToHiveColumn(relation.schema).filterNot(partitionColumns.contains) + + HiveTable( + specifiedDatabase = Option(dbName), + name = tblName, + schema = dataColumns, + partitionColumns = partitionColumns, + tableType = tableType, + properties = tableProperties.toMap, + serdeProperties = options, + location = Some(relation.paths.head), + viewText = None, // TODO We need to place the SQL string here. + inputFormat = serde.inputFormat, + outputFormat = serde.outputFormat, + serde = serde.serde) + } + + // TODO: Support persisting partitioned data source relations in Hive compatible format + val hiveTable = (maybeSerDe, dataSource.relation) match { + case (Some(serde), relation: HadoopFsRelation) + if relation.paths.length == 1 && relation.partitionColumns.isEmpty => + // Hive ParquetSerDe doesn't support decimal type until 1.2.0. + val isParquetSerDe = serde.inputFormat.exists(_.toLowerCase.contains("parquet")) + val hasDecimalFields = relation.schema.existsRecursively(_.isInstanceOf[DecimalType]) + + val hiveParquetSupportsDecimal = client.version match { + case org.apache.spark.sql.hive.client.hive.v1_2 => true + case _ => false + } + + if (isParquetSerDe && !hiveParquetSupportsDecimal && hasDecimalFields) { + // If Hive version is below 1.2.0, we cannot save Hive compatible schema to + // metastore when the file format is Parquet and the schema has DecimalType. + logWarning { + "Persisting Parquet relation with decimal field(s) into Hive metastore in Spark SQL " + + "specific format, which is NOT compatible with Hive. Because ParquetHiveSerDe in " + + s"Hive ${client.version.fullVersion} doesn't support decimal type. See HIVE-6384." + } + newSparkSQLSpecificMetastoreTable() + } else { + logInfo { + "Persisting data source relation with a single input path into Hive metastore in " + + s"Hive compatible format. Input path: ${relation.paths.head}" + } + newHiveCompatibleMetastoreTable(relation, serde) + } + + case (Some(serde), relation: HadoopFsRelation) if relation.partitionColumns.nonEmpty => + logWarning { + "Persisting partitioned data source relation into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive. 
Input path(s): " + + relation.paths.mkString("\n", "\n", "") + } + newSparkSQLSpecificMetastoreTable() + + case (Some(serde), relation: HadoopFsRelation) => + logWarning { + "Persisting data source relation with multiple input paths into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive. Input paths: " + + relation.paths.mkString("\n", "\n", "") + } + newSparkSQLSpecificMetastoreTable() + + case (Some(serde), _) => + logWarning { + s"Data source relation is not a ${classOf[HadoopFsRelation].getSimpleName}. " + + "Persisting it into Hive metastore in Spark SQL specific format, " + + "which is NOT compatible with Hive." + } + newSparkSQLSpecificMetastoreTable() + + case _ => + logWarning { + s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + "Persisting data source relation into Hive metastore in Spark SQL specific format, " + + "which is NOT compatible with Hive." + } + newSparkSQLSpecificMetastoreTable() + } + + client.createTable(hiveTable) } def hiveDefaultTableFilePath(tableName: String): String = { + hiveDefaultTableFilePath(new SqlParser().parseTableIdentifier(tableName)) + } + + def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { // Code based on: hiveWarehouse.getTablePath(currentDatabase, tableName) + val database = tableIdent.database.getOrElse(client.currentDatabase) + new Path( - new Path(client.getDatabase(client.currentDatabase).location), - tableName.toLowerCase).toString + new Path(client.getDatabase(database).location), + tableIdent.table.toLowerCase).toString } def tableExists(tableIdentifier: Seq[String]): Boolean = { @@ -391,7 +550,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive */ object ParquetConversions extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { - if (!plan.resolved) { + if (!plan.resolved || plan.analyzed) { return plan } @@ -418,8 +577,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive (relation, parquetRelation, attributedRewrites) // Read path - case p @ PhysicalOperation(_, _, relation: MetastoreRelation) - if hive.convertMetastoreParquet && + case relation: MetastoreRelation if hive.convertMetastoreParquet && relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => val parquetRelation = convertToParquetRelation(relation) val attributedRewrites = relation.output.zip(parquetRelation.output) @@ -464,7 +622,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive case p: LogicalPlan if !p.childrenResolved => p case p: LogicalPlan if p.resolved => p case p @ CreateTableAsSelect(table, child, allowExisting) => - val schema = if (table.schema.size > 0) { + val schema = if (table.schema.nonEmpty) { table.schema } else { child.output.map { @@ -487,7 +645,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTableUsingAsSelect( - desc.name, + TableIdentifier(desc.name), hive.conf.defaultDataSourceName, temporary = false, Array.empty[String], @@ -593,7 +751,7 @@ private[hive] case class InsertIntoHiveTable( extends LogicalPlan { override def children: Seq[LogicalPlan] = child :: Nil - override def output: Seq[Attribute] = child.output + override def output: Seq[Attribute] = Seq.empty val numDynamicPartitions = partition.values.count(_.isEmpty) @@ -610,7 +768,7 @@ private[hive] case class MetastoreRelation (databaseName: 
String, tableName: String, alias: Option[String]) (val table: HiveTable) (@transient sqlContext: SQLContext) - extends LeafNode with MultiInstanceRelation { + extends LeafNode with MultiInstanceRelation with FileRelation { override def equals(other: Any): Boolean = other match { case relation: MetastoreRelation => @@ -649,11 +807,12 @@ private[hive] case class MetastoreRelation table.outputFormat.foreach(sd.setOutputFormat) val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo - sd.setSerdeInfo(serdeInfo) table.serde.foreach(serdeInfo.setSerializationLib) + sd.setSerdeInfo(serdeInfo) + val serdeParameters = new java.util.HashMap[String, String]() - serdeInfo.setParameters(serdeParameters) table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } + serdeInfo.setParameters(serdeParameters) new Table(tTable) } @@ -758,6 +917,18 @@ private[hive] case class MetastoreRelation /** An attribute map for determining the ordinal for non-partition columns. */ val columnOrdinals = AttributeMap(attributes.zipWithIndex) + override def inputFiles: Array[String] = { + val partLocations = table.getPartitions(Nil).map(_.storage.location).toArray + if (partLocations.nonEmpty) { + partLocations + } else { + Array( + table.location.getOrElse( + sys.error(s"Could not get the location of ${table.qualifiedName}."))) + } + } + + override def newInstance(): MetastoreRelation = { MetastoreRelation(databaseName, tableName, alias)(table)(sqlContext) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index e6df64d2642b..ad33dee555dd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -18,8 +18,10 @@ package org.apache.spark.sql.hive import java.sql.Date +import java.util.Locale import org.apache.hadoop.hive.conf.HiveConf +import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.ql.{ErrorMsg, Context} import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, FunctionInfo} @@ -30,6 +32,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -42,6 +45,7 @@ import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler /* Implicit conversions */ @@ -80,6 +84,7 @@ private[hive] object HiveQl extends Logging { "TOK_ALTERDATABASE_PROPERTIES", "TOK_ALTERINDEX_PROPERTIES", "TOK_ALTERINDEX_REBUILD", + "TOK_ALTERTABLE", "TOK_ALTERTABLE_ADDCOLS", "TOK_ALTERTABLE_ADDPARTS", "TOK_ALTERTABLE_ALTERPARTS", @@ -94,6 +99,7 @@ private[hive] object HiveQl extends Logging { "TOK_ALTERTABLE_SKEWED", "TOK_ALTERTABLE_TOUCH", "TOK_ALTERTABLE_UNARCHIVE", + "TOK_ALTERVIEW", "TOK_ALTERVIEW_ADDPARTS", "TOK_ALTERVIEW_AS", "TOK_ALTERVIEW_DROPPARTS", @@ -248,7 +254,7 @@ private[hive] object HiveQl extends Logging { * Otherwise, there will be Null pointer exception, * when retrieving properties form HiveConf. 
*/ - val hContext = new Context(hiveConf) + val hContext = new Context(SessionState.get().getConf()) val node = ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql, hContext)) hContext.clear() node @@ -257,8 +263,8 @@ private[hive] object HiveQl extends Logging { /** * Returns the HiveConf */ - private[this] def hiveConf(): HiveConf = { - val ss = SessionState.get() // SessionState is lazy initializaion, it can be null here + private[this] def hiveConf: HiveConf = { + val ss = SessionState.get() // SessionState is lazy initialization, it can be null here if (ss == null) { new HiveConf() } else { @@ -577,12 +583,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C "TOK_TABLESKEWED", // Skewed by "TOK_TABLEROWFORMAT", "TOK_TABLESERIALIZER", - "TOK_FILEFORMAT_GENERIC", // For file formats not natively supported by Hive. - "TOK_TBLSEQUENCEFILE", // Stored as SequenceFile - "TOK_TBLTEXTFILE", // Stored as TextFile - "TOK_TBLRCFILE", // Stored as RCFile - "TOK_TBLORCFILE", // Stored as ORC File - "TOK_TBLPARQUETFILE", // Stored as PARQUET + "TOK_FILEFORMAT_GENERIC", "TOK_TABLEFILEFORMAT", // User-provided InputFormat and OutputFormat "TOK_STORAGEHANDLER", // Storage handler "TOK_TABLELOCATION", @@ -605,38 +606,18 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C serde = None, viewText = None) - // default storage type abbriviation (e.g. RCFile, ORC, PARQUET etc.) + // default storage type abbreviation (e.g. RCFile, ORC, PARQUET etc.) val defaultStorageType = hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTFILEFORMAT) - // handle the default format for the storage type abbriviation - tableDesc = if ("SequenceFile".equalsIgnoreCase(defaultStorageType)) { - tableDesc.copy( - inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"), - outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")) - } else if ("RCFile".equalsIgnoreCase(defaultStorageType)) { - tableDesc.copy( - inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"), - outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"), - serde = Option(hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTRCFILESERDE))) - } else if ("ORC".equalsIgnoreCase(defaultStorageType)) { - tableDesc.copy( - inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"), - outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"), - serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) - } else if ("PARQUET".equalsIgnoreCase(defaultStorageType)) { - tableDesc.copy( - inputFormat = - Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"), - outputFormat = - Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"), - serde = - Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) - } else { - tableDesc.copy( - inputFormat = - Option("org.apache.hadoop.mapred.TextInputFormat"), - outputFormat = - Option("org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat")) - } + // handle the default format for the storage type abbreviation + val hiveSerDe = HiveSerDe.sourceToSerDe(defaultStorageType, hiveConf).getOrElse { + HiveSerDe( + inputFormat = Option("org.apache.hadoop.mapred.TextInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat")) + } + + hiveSerDe.inputFormat.foreach(f => tableDesc = tableDesc.copy(inputFormat = Some(f))) + hiveSerDe.outputFormat.foreach(f => tableDesc = tableDesc.copy(outputFormat = Some(f))) + 
hiveSerDe.serde.foreach(f => tableDesc = tableDesc.copy(serde = Some(f))) children.collect { case list @ Token("TOK_TABCOLLIST", _) => @@ -706,36 +687,62 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C tableDesc = tableDesc.copy(serdeProperties = tableDesc.serdeProperties ++ serdeParams) } case Token("TOK_FILEFORMAT_GENERIC", child :: Nil) => - throw new SemanticException( - "Unrecognized file format in STORED AS clause:${child.getText}") + child.getText().toLowerCase(Locale.ENGLISH) match { + case "orc" => + tableDesc = tableDesc.copy( + inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + if (tableDesc.serde.isEmpty) { + tableDesc = tableDesc.copy( + serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + } - case Token("TOK_TBLRCFILE", Nil) => - tableDesc = tableDesc.copy( - inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"), - outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - if (tableDesc.serde.isEmpty) { - tableDesc = tableDesc.copy( - serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) - } + case "parquet" => + tableDesc = tableDesc.copy( + inputFormat = + Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"), + outputFormat = + Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) + if (tableDesc.serde.isEmpty) { + tableDesc = tableDesc.copy( + serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) + } - case Token("TOK_TBLORCFILE", Nil) => - tableDesc = tableDesc.copy( - inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"), - outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) - if (tableDesc.serde.isEmpty) { - tableDesc = tableDesc.copy( - serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) - } + case "rcfile" => + tableDesc = tableDesc.copy( + inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + if (tableDesc.serde.isEmpty) { + tableDesc = tableDesc.copy( + serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + } - case Token("TOK_TBLPARQUETFILE", Nil) => - tableDesc = tableDesc.copy( - inputFormat = - Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"), - outputFormat = - Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) - if (tableDesc.serde.isEmpty) { - tableDesc = tableDesc.copy( - serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) + case "textfile" => + tableDesc = tableDesc.copy( + inputFormat = + Option("org.apache.hadoop.mapred.TextInputFormat"), + outputFormat = + Option("org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat")) + + case "sequencefile" => + tableDesc = tableDesc.copy( + inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"), + outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")) + + case "avro" => + tableDesc = tableDesc.copy( + inputFormat = + Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"), + outputFormat = + Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")) + if (tableDesc.serde.isEmpty) { + tableDesc = tableDesc.copy( + serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe")) + } + + case _ => + throw new SemanticException( + s"Unrecognized 
file format in STORED AS clause: ${child.getText}") } case Token("TOK_TABLESERIALIZER", @@ -751,7 +758,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_TABLEPROPERTIES", list :: Nil) => tableDesc = tableDesc.copy(properties = tableDesc.properties ++ getProperties(list)) - case list @ Token("TOK_TABLEFILEFORMAT", _) => + case list @ Token("TOK_TABLEFILEFORMAT", children) => tableDesc = tableDesc.copy( inputFormat = Option(BaseSemanticAnalyzer.unescapeSQLString(list.getChild(0).getText)), @@ -889,11 +896,12 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C Token("TOK_TABLEPROPLIST", propsClause) :: Nil) :: Nil) :: Nil => val serdeProps = propsClause.map { case Token("TOK_TABLEPROPERTY", Token(name, Nil) :: Token(value, Nil) :: Nil) => - (name, value) + (BaseSemanticAnalyzer.unescapeSQLString(name), + BaseSemanticAnalyzer.unescapeSQLString(value)) } (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), serdeProps) - case Nil => (Nil, None, Nil) + case Nil => (Nil, Option(hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)), Nil) } val (inRowFormat, inSerdeClass, inSerdeProps) = matchSerDe(inputSerdeClause) @@ -1037,10 +1045,11 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // return With plan if there is CTE cteRelations.map(With(query, _)).getOrElse(query) - case Token("TOK_UNION", left :: right :: Nil) => Union(nodeToPlan(left), nodeToPlan(right)) + // HIVE-9039 renamed TOK_UNION => TOK_UNIONALL while adding TOK_UNIONDISTINCT + case Token("TOK_UNIONALL", left :: right :: Nil) => Union(nodeToPlan(left), nodeToPlan(right)) case a: ASTNode => - throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ") + throw new NotImplementedError(s"No parse rules for $node:\n ${dumpTree(a).toString} ") } val allJoinTokens = "(TOK_.*JOIN)".r @@ -1251,7 +1260,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C InsertIntoTable(UnresolvedRelation(tableIdent, None), partitionKeys, query, overwrite, true) case a: ASTNode => - throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ") + throw new NotImplementedError(s"No parse rules for ${a.getName}:" + + s"\n ${dumpTree(a).toString} ") } protected def selExprNodeToExpr(node: Node): Option[Expression] = node match { @@ -1274,7 +1284,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_HINTLIST", _) => None case a: ASTNode => - throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ") + throw new NotImplementedError(s"No parse rules for ${a.getName }:" + + s"\n ${dumpTree(a).toString } ") } protected val escapedIdentifier = "`([^`]+)`".r @@ -1520,6 +1531,30 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case ast: ASTNode if ast.getType == HiveParser.TOK_CHARSETLITERAL => Literal(BaseSemanticAnalyzer.charSetString(ast.getChild(0).getText, ast.getChild(1).getText)) + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_YEAR_MONTH_LITERAL => + Literal(CalendarInterval.fromYearMonthString(ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_DAY_TIME_LITERAL => + Literal(CalendarInterval.fromDayTimeString(ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_YEAR_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("year", ast.getText)) + + case ast: ASTNode if ast.getType == 
HiveParser.TOK_INTERVAL_MONTH_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("month", ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_DAY_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("day", ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_HOUR_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("hour", ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_MINUTE_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("minute", ast.getText)) + + case ast: ASTNode if ast.getType == HiveParser.TOK_INTERVAL_SECOND_LITERAL => + Literal(CalendarInterval.fromSingleUnitString("second", ast.getText)) + case a: ASTNode => throw new NotImplementedError( s"""No parse rules for ASTNode type: ${a.getType}, text: ${a.getText} : diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index a357bb39ca7f..267074f3ad10 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.hive import java.io.{InputStream, OutputStream} import java.rmi.server.UID +import org.apache.avro.Schema + /* Implicit conversions */ import scala.collection.JavaConversions._ import scala.language.implicitConversions @@ -33,7 +35,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.exec.{UDF, Utilities} import org.apache.hadoop.hive.ql.plan.{FileSinkDesc, TableDesc} import org.apache.hadoop.hive.serde2.ColumnProjectionUtils -import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable +import org.apache.hadoop.hive.serde2.avro.{AvroGenericRecordWritable, AvroSerdeUtils} import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector import org.apache.hadoop.io.Writable @@ -82,10 +84,19 @@ private[hive] object HiveShim { * Bug introduced in hive-0.13. AvroGenericRecordWritable has a member recordReaderID that * is needed to initialize before serialization. */ - def prepareWritable(w: Writable): Writable = { + def prepareWritable(w: Writable, serDeProps: Seq[(String, String)]): Writable = { w match { case w: AvroGenericRecordWritable => w.setRecordReaderID(new UID()) + // In Hive 1.1, the record's schema may need to be initialized manually or a NPE will + // be thrown. 
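The fallback applied in the lines that follow only needs to turn the schema-literal serde property back into an Avro Schema object; on its own that parse is a one-liner (the record below is an illustrative schema, and org.apache.avro is assumed to be on the classpath, as it is for this module):

import org.apache.avro.Schema

// Parse a schema literal of the kind stored in the table's serde properties.
val schemaLiteral =
  """{"type":"record","name":"Rec","fields":[{"name":"id","type":"long"}]}"""
val fileSchema: Schema = new Schema.Parser().parse(schemaLiteral)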
+ if (w.getFileSchema() == null) { + serDeProps + .find(_._1 == AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName()) + .foreach { kv => + w.setFileSchema(new Schema.Parser().parse(kv._2)) + } + } case _ => } w diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index cd6cd322c94e..d38ad9127327 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -83,14 +83,16 @@ private[hive] trait HiveStrategies { object HiveDDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case CreateTableUsing( - tableName, userSpecifiedSchema, provider, false, opts, allowExisting, managedIfNoPath) => - ExecutedCommand( + tableIdent, userSpecifiedSchema, provider, false, opts, allowExisting, managedIfNoPath) => + val cmd = CreateMetastoreDataSource( - tableName, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath)) :: Nil + tableIdent, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath) + ExecutedCommand(cmd) :: Nil - case CreateTableUsingAsSelect(tableName, provider, false, partitionCols, mode, opts, query) => + case CreateTableUsingAsSelect( + tableIdent, provider, false, partitionCols, mode, opts, query) => val cmd = - CreateMetastoreDataSourceAsSelect(tableName, provider, partitionCols, mode, opts, query) + CreateMetastoreDataSourceAsSelect(tableIdent, provider, partitionCols, mode, opts, query) ExecutedCommand(cmd) :: Nil case _ => Nil diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala index d834b4e83e04..3811c152a7ae 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala @@ -87,6 +87,13 @@ private[hive] case class HiveTable( * shared classes. */ private[hive] trait ClientInterface { + + /** Returns the Hive Version of this client. */ + def version: HiveVersion + + /** Returns the configuration for the given key in the current session. */ + def getConf(key: String, defaultValue: String): String + /** * Runs a HiveQL command using Hive, returning the results as a list of strings. Each row will * result in one string. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala index 6e0912da5862..f49c97de8ff4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala @@ -32,13 +32,14 @@ import org.apache.hadoop.hive.ql.metadata.Hive import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.ql.{Driver, metadata} +import org.apache.hadoop.hive.shims.{HadoopShims, ShimLoader} +import org.apache.hadoop.util.VersionInfo import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.util.{CircularBuffer, Utils} - /** * A class that wraps the HiveClient and converts its responses to externally visible classes. 
* Note that this class is typically loaded with an internal classloader for each instantiation, @@ -57,12 +58,80 @@ import org.apache.spark.util.{CircularBuffer, Utils} * this ClientWrapper. */ private[hive] class ClientWrapper( - version: HiveVersion, + override val version: HiveVersion, config: Map[String, String], initClassLoader: ClassLoader) extends ClientInterface with Logging { + overrideHadoopShims() + + // !! HACK ALERT !! + // + // Internally, Hive `ShimLoader` tries to load different versions of Hadoop shims by checking + // major version number gathered from Hadoop jar files: + // + // - For major version number 1, load `Hadoop20SShims`, where "20S" stands for Hadoop 0.20 with + // security. + // - For major version number 2, load `Hadoop23Shims`, where "23" stands for Hadoop 0.23. + // + // However, APIs in Hadoop 2.0.x and 2.1.x versions were in flux due to historical reasons. It + // turns out that Hadoop 2.0.x versions should also be used together with `Hadoop20SShims`, but + // `Hadoop23Shims` is chosen because the major version number here is 2. + // + // To fix this issue, we try to inspect Hadoop version via `org.apache.hadoop.utils.VersionInfo` + // and load `Hadoop20SShims` for Hadoop 1.x and 2.0.x versions. If Hadoop version information is + // not available, we decide whether to override the shims or not by checking for existence of a + // probe method which doesn't exist in Hadoop 1.x or 2.0.x versions. + private def overrideHadoopShims(): Unit = { + val hadoopVersion = VersionInfo.getVersion + val VersionPattern = """(\d+)\.(\d+).*""".r + + hadoopVersion match { + case null => + logError("Failed to inspect Hadoop version") + + // Using "Path.getPathWithoutSchemeAndAuthority" as the probe method. + val probeMethod = "getPathWithoutSchemeAndAuthority" + if (!classOf[Path].getDeclaredMethods.exists(_.getName == probeMethod)) { + logInfo( + s"Method ${classOf[Path].getCanonicalName}.$probeMethod not found, " + + s"we are probably using Hadoop 1.x or 2.0.x") + loadHadoop20SShims() + } + + case VersionPattern(majorVersion, minorVersion) => + logInfo(s"Inspected Hadoop version: $hadoopVersion") + + // Loads Hadoop20SShims for 1.x and 2.0.x versions + val (major, minor) = (majorVersion.toInt, minorVersion.toInt) + if (major < 2 || (major == 2 && minor == 0)) { + loadHadoop20SShims() + } + } + + // Logs the actual loaded Hadoop shims class + val loadedShimsClassName = ShimLoader.getHadoopShims.getClass.getCanonicalName + logInfo(s"Loaded $loadedShimsClassName for Hadoop version $hadoopVersion") + } + + private def loadHadoop20SShims(): Unit = { + val hadoop20SShimsClassName = "org.apache.hadoop.hive.shims.Hadoop20SShims" + logInfo(s"Loading Hadoop shims $hadoop20SShimsClassName") + + try { + val shimsField = classOf[ShimLoader].getDeclaredField("hadoopShims") + // scalastyle:off classforname + val shimsClass = Class.forName(hadoop20SShimsClassName) + // scalastyle:on classforname + val shims = classOf[HadoopShims].cast(shimsClass.newInstance()) + shimsField.setAccessible(true) + shimsField.set(null, shims) + } catch { case cause: Throwable => + throw new RuntimeException(s"Failed to load $hadoop20SShimsClassName", cause) + } + } + // Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur. private val outputBuffer = new CircularBuffer() @@ -115,6 +184,10 @@ private[hive] class ClientWrapper( /** Returns the configuration for the current session. 
*/ def conf: HiveConf = SessionState.get().getConf + override def getConf(key: String, defaultValue: String): String = { + conf.get(key, defaultValue) + } + // TODO: should be a def?s // When we create this val client, the HiveConf of it (conf) is the one associated with state. @GuardedBy("this") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 956997e5f9dc..8fc8935b1dc3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -25,7 +25,7 @@ import java.util.concurrent.TimeUnit import scala.collection.JavaConversions._ -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table} @@ -429,7 +429,7 @@ private[client] class Shim_v0_14 extends Shim_v0_13 { isSkewedStoreAsSubdir: Boolean): Unit = { loadPartitionMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean, holdDDLTime: JBoolean, inheritTableSpecs: JBoolean, isSkewedStoreAsSubdir: JBoolean, - JBoolean.TRUE, JBoolean.FALSE) + isSrcLocal(loadPath, hive.getConf()): JBoolean, JBoolean.FALSE) } override def loadTable( @@ -439,7 +439,7 @@ private[client] class Shim_v0_14 extends Shim_v0_13 { replace: Boolean, holdDDLTime: Boolean): Unit = { loadTableMethod.invoke(hive, loadPath, tableName, replace: JBoolean, holdDDLTime: JBoolean, - JBoolean.TRUE, JBoolean.FALSE, JBoolean.FALSE) + isSrcLocal(loadPath, hive.getConf()): JBoolean, JBoolean.FALSE, JBoolean.FALSE) } override def loadDynamicPartitions( @@ -461,6 +461,13 @@ private[client] class Shim_v0_14 extends Shim_v0_13 { HiveConf.ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY, TimeUnit.MILLISECONDS).asInstanceOf[Long] } + + protected def isSrcLocal(path: Path, conf: HiveConf): Boolean = { + val localFs = FileSystem.getLocal(conf) + val pathFs = FileSystem.get(path.toUri(), conf) + localFs.getUri() == pathFs.getUri() + } + } private[client] class Shim_v1_0 extends Shim_v0_14 { @@ -512,7 +519,7 @@ private[client] class Shim_v1_2 extends Shim_v1_1 { listBucketingEnabled: Boolean): Unit = { loadDynamicPartitionsMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean, numDP: JInteger, holdDDLTime: JBoolean, listBucketingEnabled: JBoolean, JBoolean.FALSE, - 0: JLong) + 0L: JLong) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 97fb98199991..785603750841 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -42,11 +42,18 @@ private[hive] object IsolatedClientLoader { def forVersion( version: String, config: Map[String, String] = Map.empty, - ivyPath: Option[String] = None): IsolatedClientLoader = synchronized { + ivyPath: Option[String] = None, + sharedPrefixes: Seq[String] = Seq.empty, + barrierPrefixes: Seq[String] = Seq.empty): IsolatedClientLoader = synchronized { val resolvedVersion = hiveVersion(version) val files = resolvedVersions.getOrElseUpdate(resolvedVersion, downloadVersion(resolvedVersion, ivyPath)) - new IsolatedClientLoader(hiveVersion(version), files, config) + new IsolatedClientLoader( + 
version = hiveVersion(version), + execJars = files, + config = config, + sharedPrefixes = sharedPrefixes, + barrierPrefixes = barrierPrefixes) } def hiveVersion(version: String): HiveVersion = version match { @@ -55,7 +62,7 @@ private[hive] object IsolatedClientLoader { case "14" | "0.14" | "0.14.0" => hive.v14 case "1.0" | "1.0.0" => hive.v1_0 case "1.1" | "1.1.0" => hive.v1_1 - case "1.2" | "1.2.0" => hive.v1_2 + case "1.2" | "1.2.0" | "1.2.1" => hive.v1_2 } private def downloadVersion(version: HiveVersion, ivyPath: Option[String]): Seq[URL] = { @@ -77,7 +84,7 @@ private[hive] object IsolatedClientLoader { // TODO: Remove copy logic. val tempDir = Utils.createTempDir(namePrefix = s"hive-${version}") allFiles.foreach(f => FileUtils.copyFileToDirectory(f, tempDir)) - tempDir.listFiles().map(_.toURL) + tempDir.listFiles().map(_.toURI.toURL) } private def resolvedVersions = new scala.collection.mutable.HashMap[HiveVersion, Seq[URL]] diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala index b48082fe4b36..b1b8439efa01 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala @@ -25,7 +25,7 @@ package object client { val exclusions: Seq[String] = Nil) // scalastyle:off - private[client] object hive { + private[hive] object hive { case object v12 extends HiveVersion("0.12.0") case object v13 extends HiveVersion("0.13.1") @@ -56,7 +56,7 @@ package object client { "net.hydromatic:linq4j", "net.hydromatic:quidem")) - case object v1_2 extends HiveVersion("1.2.0", + case object v1_2 extends HiveVersion("1.2.1", exclusions = Seq("eigenbase:eigenbase-properties", "org.apache.curator:*", "org.pentaho:pentaho-aggdesigner-algorithm", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala index 84358cb73c9e..8422287e177e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala @@ -40,6 +40,8 @@ case class CreateTableAsSelect( def database: String = tableDesc.database def tableName: String = tableDesc.name + override def children: Seq[LogicalPlan] = Seq(query) + override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { @@ -91,6 +93,6 @@ case class CreateTableAsSelect( } override def argString: String = { - s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString + s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]" } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 40a6a3215668..62efda613a17 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -61,7 +61,7 @@ case class InsertIntoHiveTable( serializer } - def output: Seq[Attribute] = child.output + def output: Seq[Attribute] = Seq.empty def saveAsHiveFile( rdd: RDD[InternalRow], @@ -129,7 +129,7 @@ case class InsertIntoHiveTable( // instances within the closure, 
since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc val tableLocation = table.hiveQlTable.getDataLocation - val tmpLocation = hiveContext.getExternalTmpPath(tableLocation.toUri) + val tmpLocation = hiveContext.getExternalTmpPath(tableLocation) val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val isCompressed = sc.hiveconf.getBoolean( ConfVars.COMPRESSRESULT.varname, ConfVars.COMPRESSRESULT.defaultBoolVal) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index 7e3342cc84c0..ade27454b9d2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -27,11 +27,10 @@ import scala.util.control.NonFatal import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.AbstractSerDe import org.apache.hadoop.hive.serde2.objectinspector._ +import org.apache.hadoop.io.Writable -import org.apache.spark.{TaskContext, Logging} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema import org.apache.spark.sql.execution._ @@ -39,6 +38,7 @@ import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.hive.{HiveContext, HiveInspectors} import org.apache.spark.sql.types.DataType import org.apache.spark.util.{CircularBuffer, RedirectThread, Utils} +import org.apache.spark.{Logging, TaskContext} /** * Transforms the input by forking and running the specified script. @@ -88,6 +88,7 @@ case class ScriptTransformation( // external process. That process's output will be read by this current thread. 
val writerThread = new ScriptTransformationWriterThread( inputIterator, + input.map(_.dataType), outputProjection, inputSerde, inputSoi, @@ -106,9 +107,15 @@ case class ScriptTransformation( val reader = new BufferedReader(new InputStreamReader(inputStream)) val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors { - var cacheRow: InternalRow = null var curLine: String = null - var eof: Boolean = false + val scriptOutputStream = new DataInputStream(inputStream) + var scriptOutputWritable: Writable = null + val reusedWritableObject: Writable = if (null != outputSerde) { + outputSerde.getSerializedClass().newInstance + } else { + null + } + val mutableRow = new SpecificMutableRow(output.map(_.dataType)) override def hasNext: Boolean = { if (outputSerde == null) { @@ -125,45 +132,20 @@ case class ScriptTransformation( } else { true } - } else { - if (eof) { - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } - false - } else { + } else if (scriptOutputWritable == null) { + scriptOutputWritable = reusedWritableObject + try { + scriptOutputWritable.readFields(scriptOutputStream) true + } catch { + case _: EOFException => + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + false } - } - } - - def deserialize(): InternalRow = { - if (cacheRow != null) return cacheRow - - val mutableRow = new SpecificMutableRow(output.map(_.dataType)) - try { - val dataInputStream = new DataInputStream(inputStream) - val writable = outputSerde.getSerializedClass().newInstance - writable.readFields(dataInputStream) - - val raw = outputSerde.deserialize(writable) - val dataList = outputSoi.getStructFieldsDataAsList(raw) - val fieldList = outputSoi.getAllStructFieldRefs() - - var i = 0 - dataList.foreach( element => { - if (element == null) { - mutableRow.setNullAt(i) - } else { - mutableRow(i) = unwrap(element, fieldList(i).getFieldObjectInspector) - } - i += 1 - }) - mutableRow - } catch { - case e: EOFException => - eof = true - null + } else { + true } } @@ -171,7 +153,6 @@ case class ScriptTransformation( if (!hasNext) { throw new NoSuchElementException } - if (outputSerde == null) { val prevLine = curLine curLine = reader.readLine() @@ -185,12 +166,20 @@ case class ScriptTransformation( .map(CatalystTypeConverters.convertToCatalyst)) } } else { - val ret = deserialize() - if (!eof) { - cacheRow = null - cacheRow = deserialize() + val raw = outputSerde.deserialize(scriptOutputWritable) + scriptOutputWritable = null + val dataList = outputSoi.getStructFieldsDataAsList(raw) + val fieldList = outputSoi.getAllStructFieldRefs() + var i = 0 + while (i < dataList.size()) { + if (dataList(i) == null) { + mutableRow.setNullAt(i) + } else { + mutableRow(i) = unwrap(dataList(i), fieldList(i).getFieldObjectInspector) + } + i += 1 } - ret + mutableRow } } } @@ -213,6 +202,7 @@ case class ScriptTransformation( private class ScriptTransformationWriterThread( iter: Iterator[InternalRow], + inputSchema: Seq[DataType], outputProjection: Projection, @Nullable inputSerde: AbstractSerDe, @Nullable inputSoi: ObjectInspector, @@ -238,16 +228,29 @@ private class ScriptTransformationWriterThread( // We can't use Utils.tryWithSafeFinally here because we also need a `catch` block, so // let's use a variable to record whether the `finally` block was hit due to an exception var threwException: Boolean = true + val len = inputSchema.length try { iter.map(outputProjection).foreach { row => if (inputSerde == null) { - val data = row.mkString("", 
ioschema.inputRowFormatMap("TOK_TABLEROWFORMATFIELD"), - ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES")).getBytes("utf-8") - outputStream.write(data) + val data = if (len == 0) { + ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES") + } else { + val sb = new StringBuilder + sb.append(row.get(0, inputSchema(0))) + var i = 1 + while (i < len) { + sb.append(ioschema.inputRowFormatMap("TOK_TABLEROWFORMATFIELD")) + sb.append(row.get(i, inputSchema(i))) + i += 1 + } + sb.append(ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES")) + sb.toString() + } + outputStream.write(data.getBytes("utf-8")) } else { val writable = inputSerde.serialize( row.asInstanceOf[GenericInternalRow].values, inputSoi) - prepareWritable(writable).write(dataOutputStream) + prepareWritable(writable, ioschema.outputSerdeProps).write(dataOutputStream) } } outputStream.close() @@ -320,18 +323,8 @@ case class HiveScriptIOSchema ( } private def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = { - val columns = attrs.map { - case aref: AttributeReference => aref.name - case e: NamedExpression => e.name - case _ => null - } - - val columnTypes = attrs.map { - case aref: AttributeReference => aref.dataType - case e: NamedExpression => e.dataType - case _ => null - } - + val columns = attrs.zipWithIndex.map(e => s"${e._1.prettyName}_${e._2}") + val columnTypes = attrs.map(_.dataType) (columns, columnTypes) } @@ -345,9 +338,7 @@ case class HiveScriptIOSchema ( val columnTypesNames = columnTypes.map(_.toTypeInfo.getTypeName()).mkString(",") - var propsMap = serdeProps.map(kv => { - (kv._1.split("'")(1), kv._2.split("'")(1)) - }).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(",")) + var propsMap = serdeProps.toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(",")) propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames) val properties = new Properties() diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index a47f9a4feb21..d1699dd53681 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.hive.execution +import org.apache.hadoop.hive.metastore.MetaStoreUtils import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.{TableIdentifier, SqlParser} import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -88,7 +90,7 @@ case class AddJar(path: String) extends RunnableCommand { val currentClassLoader = Utils.getContextOrSparkClassLoader // Add jar to current context - val jarURL = new java.io.File(path).toURL + val jarURL = new java.io.File(path).toURI.toURL val newClassLoader = new java.net.URLClassLoader(Array(jarURL), currentClassLoader) Thread.currentThread.setContextClassLoader(newClassLoader) // We need to explicitly set the class loader associated with the conf in executionHive's @@ -120,9 +122,10 @@ case class AddFile(path: String) extends RunnableCommand { } } +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). 
private[hive] case class CreateMetastoreDataSource( - tableName: String, + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String], @@ -130,9 +133,24 @@ case class CreateMetastoreDataSource( managedIfNoPath: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { + // Since we are saving metadata to metastore, we need to check if metastore supports + // the table name and database name we have for this query. MetaStoreUtils.validateName + // is the method used by Hive to check if a table name or a database name is valid for + // the metastore. + if (!MetaStoreUtils.validateName(tableIdent.table)) { + throw new AnalysisException(s"Table name ${tableIdent.table} is not a valid name for " + + s"metastore. Metastore only accepts table name containing characters, numbers and _.") + } + if (tableIdent.database.isDefined && !MetaStoreUtils.validateName(tableIdent.database.get)) { + throw new AnalysisException(s"Database name ${tableIdent.database.get} is not a valid name " + + s"for metastore. Metastore only accepts database name containing " + + s"characters, numbers and _.") + } + + val tableName = tableIdent.unquotedString val hiveContext = sqlContext.asInstanceOf[HiveContext] - if (hiveContext.catalog.tableExists(tableName :: Nil)) { + if (hiveContext.catalog.tableExists(tableIdent.toSeq)) { if (allowExisting) { return Seq.empty[Row] } else { @@ -144,13 +162,13 @@ case class CreateMetastoreDataSource( val optionsWithPath = if (!options.contains("path") && managedIfNoPath) { isExternal = false - options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableName)) + options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableIdent)) } else { options } hiveContext.catalog.createDataSourceTable( - tableName, + tableIdent, userSpecifiedSchema, Array.empty[String], provider, @@ -161,9 +179,10 @@ case class CreateMetastoreDataSource( } } +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). private[hive] case class CreateMetastoreDataSourceAsSelect( - tableName: String, + tableIdent: TableIdentifier, provider: String, partitionColumns: Array[String], mode: SaveMode, @@ -171,19 +190,34 @@ case class CreateMetastoreDataSourceAsSelect( query: LogicalPlan) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { + // Since we are saving metadata to metastore, we need to check if metastore supports + // the table name and database name we have for this query. MetaStoreUtils.validateName + // is the method used by Hive to check if a table name or a database name is valid for + // the metastore. + if (!MetaStoreUtils.validateName(tableIdent.table)) { + throw new AnalysisException(s"Table name ${tableIdent.table} is not a valid name for " + + s"metastore. Metastore only accepts table name containing characters, numbers and _.") + } + if (tableIdent.database.isDefined && !MetaStoreUtils.validateName(tableIdent.database.get)) { + throw new AnalysisException(s"Database name ${tableIdent.database.get} is not a valid name " + + s"for metastore. 
Metastore only accepts database name containing " + + s"characters, numbers and _.") + } + + val tableName = tableIdent.unquotedString val hiveContext = sqlContext.asInstanceOf[HiveContext] var createMetastoreTable = false var isExternal = true val optionsWithPath = if (!options.contains("path")) { isExternal = false - options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableName)) + options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableIdent)) } else { options } var existingSchema = None: Option[StructType] - if (sqlContext.catalog.tableExists(Seq(tableName))) { + if (sqlContext.catalog.tableExists(tableIdent.toSeq)) { // Check if we need to throw an exception or just return. mode match { case SaveMode.ErrorIfExists => @@ -200,7 +234,7 @@ case class CreateMetastoreDataSourceAsSelect( val resolved = ResolvedDataSource( sqlContext, Some(query.schema.asNullable), partitionColumns, provider, optionsWithPath) val createdRelation = LogicalRelation(resolved.relation) - EliminateSubQueries(sqlContext.table(tableName).logicalPlan) match { + EliminateSubQueries(sqlContext.catalog.lookupRelation(tableIdent.toSeq)) match { case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation) => if (l.relation != createdRelation.relation) { val errorDescription = @@ -249,7 +283,7 @@ case class CreateMetastoreDataSourceAsSelect( // the schema of df). It is important since the nullability may be changed by the relation // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). hiveContext.catalog.createDataSourceTable( - tableName, + tableIdent, Some(resolved.relation.schema), partitionColumns, provider, @@ -258,7 +292,7 @@ case class CreateMetastoreDataSourceAsSelect( } // Refresh the cache of the table in the catalog. - hiveContext.refreshTable(tableName) + hiveContext.catalog.refreshTable(tableIdent) Seq.empty[Row] } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index abe5c6900313..7182246e466a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -133,8 +133,7 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre @transient private lazy val conversionHelper = new ConversionHelper(method, arguments) - @transient - lazy val dataType = javaClassToDataType(method.getReturnType) + val dataType = javaClassToDataType(method.getReturnType) @transient lazy val returnInspector = ObjectInspectorFactory.getReflectionObjectInspector( @@ -249,7 +248,7 @@ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { // Get the class of this function. // In Hive 0.12, there is no windowFunctionInfo.getFunctionClass. So, we use // windowFunctionInfo.getfInfo().getFunctionClass for both Hive 0.13 and Hive 0.13.1. 
- val functionClass = windowFunctionInfo.getfInfo().getFunctionClass + val functionClass = windowFunctionInfo.getFunctionClass() val newChildren = // Rank(), DENSE_RANK(), CUME_DIST(), and PERCENT_RANK() do not take explicit // input parameters and requires implicit parameters, which diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index 8850e060d2a7..8dc796b056a7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -171,7 +171,7 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer( import SparkHiveDynamicPartitionWriterContainer._ private val defaultPartName = jobConf.get( - ConfVars.DEFAULTPARTITIONNAME.varname, ConfVars.DEFAULTPARTITIONNAME.defaultVal) + ConfVars.DEFAULTPARTITIONNAME.varname, ConfVars.DEFAULTPARTITIONNAME.defaultStrVal) @transient private var writers: mutable.HashMap[String, FileSinkOperator.RecordWriter] = _ @@ -211,18 +211,18 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer( } } - val dynamicPartPath = dynamicPartColNames - .zip(row.toSeq.takeRight(dynamicPartColNames.length)) - .map { case (col, rawVal) => - val string = if (rawVal == null) null else convertToHiveRawString(col, rawVal) - val colString = - if (string == null || string.isEmpty) { - defaultPartName - } else { - FileUtils.escapePathName(string, defaultPartName) - } - s"/$col=$colString" - }.mkString + val nonDynamicPartLen = row.numFields - dynamicPartColNames.length + val dynamicPartPath = dynamicPartColNames.zipWithIndex.map { case (colName, i) => + val rawVal = row.get(nonDynamicPartLen + i, schema(colName).dataType) + val string = if (rawVal == null) null else convertToHiveRawString(colName, rawVal) + val colString = + if (string == null || string.isEmpty) { + defaultPartName + } else { + FileUtils.escapePathName(string, defaultPartName) + } + s"/$colName=$colString" + }.mkString def newWriter(): FileSinkOperator.RecordWriter = { val newFileSinkDesc = new FileSinkDesc( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index ddd5d24717ad..86142e5d66f3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.orc import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar} -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument +import org.apache.hadoop.hive.ql.io.sarg.{SearchArgumentFactory, SearchArgument} import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder import org.apache.hadoop.hive.serde2.io.DateWritable @@ -33,13 +33,13 @@ import org.apache.spark.sql.sources._ private[orc] object OrcFilters extends Logging { def createFilter(expr: Array[Filter]): Option[SearchArgument] = { expr.reduceOption(And).flatMap { conjunction => - val builder = SearchArgument.FACTORY.newBuilder() + val builder = SearchArgumentFactory.newBuilder() buildSearchArgument(conjunction, builder).map(_.build()) } } private def buildSearchArgument(expression: Filter, builder: Builder): Option[Builder] = { - def newBuilder = SearchArgument.FACTORY.newBuilder() + def newBuilder = SearchArgumentFactory.newBuilder() def isSearchableLiteral(value: Any): Boolean = value match { // 
These are types recognized by the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 6fa599734892..2d595189318a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow @@ -47,8 +47,11 @@ import org.apache.spark.util.SerializableConfiguration /* Implicit conversions */ import scala.collection.JavaConversions._ -private[sql] class DefaultSource extends HadoopFsRelationProvider { - def createRelation( +private[sql] class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { + + override def shortName(): String = "orc" + + override def createRelation( sqlContext: SQLContext, paths: Array[String], dataSchema: Option[StructType], @@ -66,7 +69,7 @@ private[orc] class OrcOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) - extends OutputWriterInternal with SparkHadoopMapRedUtil with HiveInspectors { + extends OutputWriter with SparkHadoopMapRedUtil with HiveInspectors { private val serializer = { val table = new Properties() @@ -76,7 +79,8 @@ private[orc] class OrcOutputWriter( }.mkString(":")) val serde = new OrcSerde - serde.initialize(context.getConfiguration, table) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + serde.initialize(configuration, table) serde } @@ -107,9 +111,10 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - val conf = context.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(context) val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID") - val partition = context.getTaskAttemptID.getTaskID.getId + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val partition = taskAttemptId.getTaskID.getId val filename = f"part-r-$partition%05d-$uniqueWriteJobId.orc" new OrcOutputFormat().getRecordWriter( @@ -120,7 +125,9 @@ private[orc] class OrcOutputWriter( ).asInstanceOf[RecordWriter[NullWritable, Writable]] } - override def writeInternal(row: InternalRow): Unit = { + override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") + + override protected[sql] def writeInternal(row: InternalRow): Unit = { var i = 0 while (i < row.numFields) { reusableOutputBuffer(i) = wrappers(i)(row.get(i, dataSchema(i).dataType)) @@ -139,7 +146,6 @@ private[orc] class OrcOutputWriter( } } -@DeveloperApi private[sql] class OrcRelation( override val paths: Array[String], maybeDataSchema: Option[StructType], @@ -291,9 +297,11 @@ private[orc] case class OrcTableScan( // Sets requested columns addColumnIds(attributes, relation, conf) - if (inputPaths.nonEmpty) { - FileInputFormat.setInputPaths(job, inputPaths.map(_.getPath): _*) + if (inputPaths.isEmpty) { + // the input path probably be pruned, return an empty RDD. 
+ return sqlContext.sparkContext.emptyRDD[InternalRow] } + FileInputFormat.setInputPaths(job, inputPaths.map(_.getPath): _*) val inputFormatClass = classOf[OrcInputFormat] diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 7bbdef90cd6b..ac9db6bf2695 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -20,29 +20,23 @@ package org.apache.spark.sql.hive.test import java.io.File import java.util.{Set => JavaSet} -import org.apache.hadoop.hive.conf.HiveConf +import scala.collection.mutable +import scala.language.implicitConversions + import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.ql.exec.FunctionRegistry -import org.apache.hadoop.hive.ql.io.avro.{AvroContainerInputFormat, AvroContainerOutputFormat} -import org.apache.hadoop.hive.ql.metadata.Table -import org.apache.hadoop.hive.ql.parse.VariableSubstitution import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe -import org.apache.hadoop.hive.serde2.avro.AvroSerDe -import org.apache.spark.sql.catalyst.CatalystConf +import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.CacheTableCommand import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.execution.HiveNativeCommand -import org.apache.spark.sql.SQLConf -import org.apache.spark.util.Utils +import org.apache.spark.util.{ShutdownHookManager, Utils} import org.apache.spark.{SparkConf, SparkContext} -import scala.collection.mutable -import scala.language.implicitConversions - /* Implicit conversions */ import scala.collection.JavaConversions._ @@ -56,7 +50,6 @@ object TestHive .set("spark.sql.test", "") .set("spark.sql.hive.metastore.barrierPrefixes", "org.apache.spark.sql.hive.execution.PairSerDe") - .set("spark.buffer.pageSize", "4m") // SPARK-8910 .set("spark.ui.enabled", "false"))) @@ -83,15 +76,25 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { hiveconf.set("hive.plan.serialization.format", "javaXML") - lazy val warehousePath = Utils.createTempDir() + lazy val warehousePath = Utils.createTempDir(namePrefix = "warehouse-") + + lazy val scratchDirPath = { + val dir = Utils.createTempDir(namePrefix = "scratch-") + dir.delete() + dir + } private lazy val temporaryConfig = newTemporaryConfiguration() /** Sets up the system initially or after a RESET command */ - protected override def configure(): Map[String, String] = - temporaryConfig ++ Map( - ConfVars.METASTOREWAREHOUSE.varname -> warehousePath.toString, - ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true") + protected override def configure(): Map[String, String] = { + super.configure() ++ temporaryConfig ++ Map( + ConfVars.METASTOREWAREHOUSE.varname -> warehousePath.toURI.toString, + ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true", + ConfVars.SCRATCHDIR.varname -> scratchDirPath.toURI.toString, + ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY.varname -> "1" + ) + } val testTempDir = Utils.createTempDir() @@ -149,7 +152,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { val hiveFilesTemp = File.createTempFile("catalystHiveFiles", "") hiveFilesTemp.delete() hiveFilesTemp.mkdir() - Utils.registerShutdownDeleteDir(hiveFilesTemp) + 
ShutdownHookManager.registerShutdownDeleteDir(hiveFilesTemp) val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) { new File("src" + File.separator + "test" + File.separator + "resources" + File.separator) @@ -244,7 +247,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { }), TestTable("src_thrift", () => { import org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer - import org.apache.hadoop.hive.serde2.thrift.test.Complex import org.apache.hadoop.mapred.{SequenceFileInputFormat, SequenceFileOutputFormat} import org.apache.thrift.protocol.TBinaryProtocol @@ -253,7 +255,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { |CREATE TABLE src_thrift(fake INT) |ROW FORMAT SERDE '${classOf[ThriftDeserializer].getName}' |WITH SERDEPROPERTIES( - | 'serialization.class'='${classOf[Complex].getName}', + | 'serialization.class'='org.apache.spark.sql.hive.test.Complex', | 'serialization.format'='${classOf[TBinaryProtocol].getName}' |) |STORED AS @@ -272,10 +274,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { "INSERT OVERWRITE TABLE serdeins SELECT * FROM src".cmd), TestTable("episodes", s"""CREATE TABLE episodes (title STRING, air_date STRING, doctor INT) - |ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}' - |STORED AS - |INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}' - |OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}' + |STORED AS avro |TBLPROPERTIES ( | 'avro.schema.literal'='{ | "type": "record", @@ -308,10 +307,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { TestTable("episodes_part", s"""CREATE TABLE episodes_part (title STRING, air_date STRING, doctor INT) |PARTITIONED BY (doctor_pt INT) - |ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}' - |STORED AS - |INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}' - |OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}' + |STORED AS avro |TBLPROPERTIES ( | 'avro.schema.literal'='{ | "type": "record", @@ -369,7 +365,11 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { INSERT OVERWRITE TABLE episodes_part PARTITION (doctor_pt=1) SELECT title, air_date, doctor FROM episodes """.cmd - ) + ), + TestTable("src_json", + s"""CREATE TABLE src_json (json STRING) STORED AS TEXTFILE + """.stripMargin.cmd, + s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/json.txt")}' INTO TABLE src_json".cmd) ) hiveQTestUtilTables.foreach(registerTestTable) @@ -437,6 +437,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { case (k, v) => metadataHive.runSqlHive(s"SET $k=$v") } + defaultOverrides() runSqlHive("USE default") diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java similarity index 68% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index 741a3cd31c60..019d8a30266e 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.hive; +package org.apache.spark.sql.hive; import java.io.IOException; import java.util.ArrayList; @@ -29,8 +29,10 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; import org.apache.spark.sql.expressions.Window; -import org.apache.spark.sql.hive.HiveContext; +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; +import static org.apache.spark.sql.functions.*; import org.apache.spark.sql.hive.test.TestHive$; +import org.apache.spark.sql.hive.aggregate.MyDoubleSum; public class JavaDataFrameSuite { private transient JavaSparkContext sc; @@ -54,7 +56,7 @@ public void setUp() throws IOException { for (int i = 0; i < 10; i++) { jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}"); } - df = hc.jsonRDD(sc.parallelize(jsonObjects)); + df = hc.read().json(sc.parallelize(jsonObjects)); df.registerTempTable("window_table"); } @@ -77,4 +79,26 @@ public void saveTableAndQueryIt() { " ROWS BETWEEN 1 preceding and 1 following) " + "FROM window_table").collectAsList()); } + + @Test + public void testUDAF() { + DataFrame df = hc.range(0, 100).unionAll(hc.range(0, 100)).select(col("id").as("value")); + UserDefinedAggregateFunction udaf = new MyDoubleSum(); + UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf); + // Create Columns for the UDAF. For now, callUDF does not take an argument to specific if + // we want to use distinct aggregation. + DataFrame aggregatedDF = + df.groupBy() + .agg( + udaf.distinct(col("value")), + udaf.apply(col("value")), + registeredUDAF.apply(col("value")), + callUDF("mydoublesum", col("value"))); + + List expectedResult = new ArrayList(); + expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0)); + checkAnswer( + aggregatedDF, + expectedResult); + } } diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java similarity index 96% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 64d1ce92931e..4192155975c4 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package test.org.apache.spark.sql.hive; +package org.apache.spark.sql.hive; import java.io.File; import java.io.IOException; @@ -37,7 +37,6 @@ import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.QueryTest$; import org.apache.spark.sql.Row; -import org.apache.spark.sql.hive.HiveContext; import org.apache.spark.sql.hive.test.TestHive$; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; @@ -90,8 +89,10 @@ public void setUp() throws IOException { @After public void tearDown() throws IOException { // Clean up tables. 
- sqlContext.sql("DROP TABLE IF EXISTS javaSavedTable"); - sqlContext.sql("DROP TABLE IF EXISTS externalTable"); + if (sqlContext != null) { + sqlContext.sql("DROP TABLE IF EXISTS javaSavedTable"); + sqlContext.sql("DROP TABLE IF EXISTS externalTable"); + } } @Test diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java similarity index 98% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java index a2247e3da155..5a167edd8959 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package test.org.apache.spark.sql.hive.aggregate; +package org.apache.spark.sql.hive.aggregate; import java.util.ArrayList; import java.util.List; @@ -65,7 +65,7 @@ public MyDoubleAvg() { return _bufferSchema; } - @Override public DataType returnDataType() { + @Override public DataType dataType() { return _returnDataType; } diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java similarity index 97% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java index da29e24d267d..c3b7768e71bf 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.hive.aggregate; +package org.apache.spark.sql.hive.aggregate; import java.util.ArrayList; import java.util.List; @@ -60,7 +60,7 @@ public MyDoubleSum() { return _bufferSchema; } - @Override public DataType returnDataType() { + @Override public DataType dataType() { return _returnDataType; } diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java new file mode 100644 index 000000000000..e010112bb932 --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java @@ -0,0 +1,1139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.hive.test; + +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.thrift.scheme.IScheme; +import org.apache.thrift.scheme.SchemeFactory; +import org.apache.thrift.scheme.StandardScheme; + +import org.apache.hadoop.hive.serde2.thrift.test.IntString; +import org.apache.thrift.scheme.TupleScheme; +import org.apache.thrift.protocol.TTupleProtocol; +import org.apache.thrift.EncodingUtils; +import java.util.List; +import java.util.ArrayList; +import java.util.Map; +import java.util.HashMap; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.Collections; +import java.util.BitSet; + +/** + * This is a fork of Hive 0.13's org/apache/hadoop/hive/serde2/thrift/test/Complex.java, which + * does not contain union fields that are not supported by Spark SQL. + */ + +@SuppressWarnings({"ALL", "unchecked"}) +public class Complex implements org.apache.thrift.TBase, java.io.Serializable, Cloneable { + private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("Complex"); + + private static final org.apache.thrift.protocol.TField AINT_FIELD_DESC = new org.apache.thrift.protocol.TField("aint", org.apache.thrift.protocol.TType.I32, (short)1); + private static final org.apache.thrift.protocol.TField A_STRING_FIELD_DESC = new org.apache.thrift.protocol.TField("aString", org.apache.thrift.protocol.TType.STRING, (short)2); + private static final org.apache.thrift.protocol.TField LINT_FIELD_DESC = new org.apache.thrift.protocol.TField("lint", org.apache.thrift.protocol.TType.LIST, (short)3); + private static final org.apache.thrift.protocol.TField L_STRING_FIELD_DESC = new org.apache.thrift.protocol.TField("lString", org.apache.thrift.protocol.TType.LIST, (short)4); + private static final org.apache.thrift.protocol.TField LINT_STRING_FIELD_DESC = new org.apache.thrift.protocol.TField("lintString", org.apache.thrift.protocol.TType.LIST, (short)5); + private static final org.apache.thrift.protocol.TField M_STRING_STRING_FIELD_DESC = new org.apache.thrift.protocol.TField("mStringString", org.apache.thrift.protocol.TType.MAP, (short)6); + + private static final Map, SchemeFactory> schemes = new HashMap, SchemeFactory>(); + static { + schemes.put(StandardScheme.class, new ComplexStandardSchemeFactory()); + schemes.put(TupleScheme.class, new ComplexTupleSchemeFactory()); + } + + private int aint; // required + private String aString; // required + private List lint; // required + private List lString; // required + private List lintString; // required + private Map mStringString; // required + + /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */ + public enum _Fields implements org.apache.thrift.TFieldIdEnum { + AINT((short)1, "aint"), + A_STRING((short)2, "aString"), + LINT((short)3, "lint"), + L_STRING((short)4, "lString"), + LINT_STRING((short)5, "lintString"), + M_STRING_STRING((short)6, "mStringString"); + + private static final Map byName = new HashMap(); + + static { + for (_Fields field : EnumSet.allOf(_Fields.class)) { + byName.put(field.getFieldName(), field); + } + } + + /** + * Find the _Fields constant that matches fieldId, or null if its not found. 
+ */ + public static _Fields findByThriftId(int fieldId) { + switch(fieldId) { + case 1: // AINT + return AINT; + case 2: // A_STRING + return A_STRING; + case 3: // LINT + return LINT; + case 4: // L_STRING + return L_STRING; + case 5: // LINT_STRING + return LINT_STRING; + case 6: // M_STRING_STRING + return M_STRING_STRING; + default: + return null; + } + } + + /** + * Find the _Fields constant that matches fieldId, throwing an exception + * if it is not found. + */ + public static _Fields findByThriftIdOrThrow(int fieldId) { + _Fields fields = findByThriftId(fieldId); + if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!"); + return fields; + } + + /** + * Find the _Fields constant that matches name, or null if its not found. + */ + public static _Fields findByName(String name) { + return byName.get(name); + } + + private final short _thriftId; + private final String _fieldName; + + _Fields(short thriftId, String fieldName) { + _thriftId = thriftId; + _fieldName = fieldName; + } + + public short getThriftFieldId() { + return _thriftId; + } + + public String getFieldName() { + return _fieldName; + } + } + + // isset id assignments + private static final int __AINT_ISSET_ID = 0; + private byte __isset_bitfield = 0; + public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap; + static { + Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class); + tmpMap.put(_Fields.AINT, new org.apache.thrift.meta_data.FieldMetaData("aint", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))); + tmpMap.put(_Fields.A_STRING, new org.apache.thrift.meta_data.FieldMetaData("aString", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING))); + tmpMap.put(_Fields.LINT, new org.apache.thrift.meta_data.FieldMetaData("lint", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)))); + tmpMap.put(_Fields.L_STRING, new org.apache.thrift.meta_data.FieldMetaData("lString", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)))); + tmpMap.put(_Fields.LINT_STRING, new org.apache.thrift.meta_data.FieldMetaData("lintString", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, + new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, IntString.class)))); + tmpMap.put(_Fields.M_STRING_STRING, new org.apache.thrift.meta_data.FieldMetaData("mStringString", org.apache.thrift.TFieldRequirementType.DEFAULT, + new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING), + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)))); + metaDataMap = Collections.unmodifiableMap(tmpMap); + org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(Complex.class, metaDataMap); + } + + public 
Complex() { + } + + public Complex( + int aint, + String aString, + List lint, + List lString, + List lintString, + Map mStringString) + { + this(); + this.aint = aint; + setAintIsSet(true); + this.aString = aString; + this.lint = lint; + this.lString = lString; + this.lintString = lintString; + this.mStringString = mStringString; + } + + /** + * Performs a deep copy on other. + */ + public Complex(Complex other) { + __isset_bitfield = other.__isset_bitfield; + this.aint = other.aint; + if (other.isSetAString()) { + this.aString = other.aString; + } + if (other.isSetLint()) { + List __this__lint = new ArrayList(); + for (Integer other_element : other.lint) { + __this__lint.add(other_element); + } + this.lint = __this__lint; + } + if (other.isSetLString()) { + List __this__lString = new ArrayList(); + for (String other_element : other.lString) { + __this__lString.add(other_element); + } + this.lString = __this__lString; + } + if (other.isSetLintString()) { + List __this__lintString = new ArrayList(); + for (IntString other_element : other.lintString) { + __this__lintString.add(new IntString(other_element)); + } + this.lintString = __this__lintString; + } + if (other.isSetMStringString()) { + Map __this__mStringString = new HashMap(); + for (Map.Entry other_element : other.mStringString.entrySet()) { + + String other_element_key = other_element.getKey(); + String other_element_value = other_element.getValue(); + + String __this__mStringString_copy_key = other_element_key; + + String __this__mStringString_copy_value = other_element_value; + + __this__mStringString.put(__this__mStringString_copy_key, __this__mStringString_copy_value); + } + this.mStringString = __this__mStringString; + } + } + + public Complex deepCopy() { + return new Complex(this); + } + + @Override + public void clear() { + setAintIsSet(false); + this.aint = 0; + this.aString = null; + this.lint = null; + this.lString = null; + this.lintString = null; + this.mStringString = null; + } + + public int getAint() { + return this.aint; + } + + public void setAint(int aint) { + this.aint = aint; + setAintIsSet(true); + } + + public void unsetAint() { + __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __AINT_ISSET_ID); + } + + /** Returns true if field aint is set (has been assigned a value) and false otherwise */ + public boolean isSetAint() { + return EncodingUtils.testBit(__isset_bitfield, __AINT_ISSET_ID); + } + + public void setAintIsSet(boolean value) { + __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __AINT_ISSET_ID, value); + } + + public String getAString() { + return this.aString; + } + + public void setAString(String aString) { + this.aString = aString; + } + + public void unsetAString() { + this.aString = null; + } + + /** Returns true if field aString is set (has been assigned a value) and false otherwise */ + public boolean isSetAString() { + return this.aString != null; + } + + public void setAStringIsSet(boolean value) { + if (!value) { + this.aString = null; + } + } + + public int getLintSize() { + return (this.lint == null) ? 0 : this.lint.size(); + } + + public java.util.Iterator getLintIterator() { + return (this.lint == null) ? 
null : this.lint.iterator(); + } + + public void addToLint(int elem) { + if (this.lint == null) { + this.lint = new ArrayList<>(); + } + this.lint.add(elem); + } + + public List getLint() { + return this.lint; + } + + public void setLint(List lint) { + this.lint = lint; + } + + public void unsetLint() { + this.lint = null; + } + + /** Returns true if field lint is set (has been assigned a value) and false otherwise */ + public boolean isSetLint() { + return this.lint != null; + } + + public void setLintIsSet(boolean value) { + if (!value) { + this.lint = null; + } + } + + public int getLStringSize() { + return (this.lString == null) ? 0 : this.lString.size(); + } + + public java.util.Iterator getLStringIterator() { + return (this.lString == null) ? null : this.lString.iterator(); + } + + public void addToLString(String elem) { + if (this.lString == null) { + this.lString = new ArrayList(); + } + this.lString.add(elem); + } + + public List getLString() { + return this.lString; + } + + public void setLString(List lString) { + this.lString = lString; + } + + public void unsetLString() { + this.lString = null; + } + + /** Returns true if field lString is set (has been assigned a value) and false otherwise */ + public boolean isSetLString() { + return this.lString != null; + } + + public void setLStringIsSet(boolean value) { + if (!value) { + this.lString = null; + } + } + + public int getLintStringSize() { + return (this.lintString == null) ? 0 : this.lintString.size(); + } + + public java.util.Iterator getLintStringIterator() { + return (this.lintString == null) ? null : this.lintString.iterator(); + } + + public void addToLintString(IntString elem) { + if (this.lintString == null) { + this.lintString = new ArrayList<>(); + } + this.lintString.add(elem); + } + + public List getLintString() { + return this.lintString; + } + + public void setLintString(List lintString) { + this.lintString = lintString; + } + + public void unsetLintString() { + this.lintString = null; + } + + /** Returns true if field lintString is set (has been assigned a value) and false otherwise */ + public boolean isSetLintString() { + return this.lintString != null; + } + + public void setLintStringIsSet(boolean value) { + if (!value) { + this.lintString = null; + } + } + + public int getMStringStringSize() { + return (this.mStringString == null) ? 
0 : this.mStringString.size(); + } + + public void putToMStringString(String key, String val) { + if (this.mStringString == null) { + this.mStringString = new HashMap(); + } + this.mStringString.put(key, val); + } + + public Map getMStringString() { + return this.mStringString; + } + + public void setMStringString(Map mStringString) { + this.mStringString = mStringString; + } + + public void unsetMStringString() { + this.mStringString = null; + } + + /** Returns true if field mStringString is set (has been assigned a value) and false otherwise */ + public boolean isSetMStringString() { + return this.mStringString != null; + } + + public void setMStringStringIsSet(boolean value) { + if (!value) { + this.mStringString = null; + } + } + + public void setFieldValue(_Fields field, Object value) { + switch (field) { + case AINT: + if (value == null) { + unsetAint(); + } else { + setAint((Integer)value); + } + break; + + case A_STRING: + if (value == null) { + unsetAString(); + } else { + setAString((String)value); + } + break; + + case LINT: + if (value == null) { + unsetLint(); + } else { + setLint((List)value); + } + break; + + case L_STRING: + if (value == null) { + unsetLString(); + } else { + setLString((List)value); + } + break; + + case LINT_STRING: + if (value == null) { + unsetLintString(); + } else { + setLintString((List)value); + } + break; + + case M_STRING_STRING: + if (value == null) { + unsetMStringString(); + } else { + setMStringString((Map)value); + } + break; + + } + } + + public Object getFieldValue(_Fields field) { + switch (field) { + case AINT: + return Integer.valueOf(getAint()); + + case A_STRING: + return getAString(); + + case LINT: + return getLint(); + + case L_STRING: + return getLString(); + + case LINT_STRING: + return getLintString(); + + case M_STRING_STRING: + return getMStringString(); + + } + throw new IllegalStateException(); + } + + /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */ + public boolean isSet(_Fields field) { + if (field == null) { + throw new IllegalArgumentException(); + } + + switch (field) { + case AINT: + return isSetAint(); + case A_STRING: + return isSetAString(); + case LINT: + return isSetLint(); + case L_STRING: + return isSetLString(); + case LINT_STRING: + return isSetLintString(); + case M_STRING_STRING: + return isSetMStringString(); + } + throw new IllegalStateException(); + } + + @Override + public boolean equals(Object that) { + if (that == null) + return false; + if (that instanceof Complex) + return this.equals((Complex)that); + return false; + } + + public boolean equals(Complex that) { + if (that == null) + return false; + + boolean this_present_aint = true; + boolean that_present_aint = true; + if (this_present_aint || that_present_aint) { + if (!(this_present_aint && that_present_aint)) + return false; + if (this.aint != that.aint) + return false; + } + + boolean this_present_aString = true && this.isSetAString(); + boolean that_present_aString = true && that.isSetAString(); + if (this_present_aString || that_present_aString) { + if (!(this_present_aString && that_present_aString)) + return false; + if (!this.aString.equals(that.aString)) + return false; + } + + boolean this_present_lint = true && this.isSetLint(); + boolean that_present_lint = true && that.isSetLint(); + if (this_present_lint || that_present_lint) { + if (!(this_present_lint && that_present_lint)) + return false; + if (!this.lint.equals(that.lint)) + return false; + } + + boolean 
this_present_lString = true && this.isSetLString(); + boolean that_present_lString = true && that.isSetLString(); + if (this_present_lString || that_present_lString) { + if (!(this_present_lString && that_present_lString)) + return false; + if (!this.lString.equals(that.lString)) + return false; + } + + boolean this_present_lintString = true && this.isSetLintString(); + boolean that_present_lintString = true && that.isSetLintString(); + if (this_present_lintString || that_present_lintString) { + if (!(this_present_lintString && that_present_lintString)) + return false; + if (!this.lintString.equals(that.lintString)) + return false; + } + + boolean this_present_mStringString = true && this.isSetMStringString(); + boolean that_present_mStringString = true && that.isSetMStringString(); + if (this_present_mStringString || that_present_mStringString) { + if (!(this_present_mStringString && that_present_mStringString)) + return false; + if (!this.mStringString.equals(that.mStringString)) + return false; + } + + return true; + } + + @Override + public int hashCode() { + HashCodeBuilder builder = new HashCodeBuilder(); + + boolean present_aint = true; + builder.append(present_aint); + if (present_aint) + builder.append(aint); + + boolean present_aString = true && (isSetAString()); + builder.append(present_aString); + if (present_aString) + builder.append(aString); + + boolean present_lint = true && (isSetLint()); + builder.append(present_lint); + if (present_lint) + builder.append(lint); + + boolean present_lString = true && (isSetLString()); + builder.append(present_lString); + if (present_lString) + builder.append(lString); + + boolean present_lintString = true && (isSetLintString()); + builder.append(present_lintString); + if (present_lintString) + builder.append(lintString); + + boolean present_mStringString = true && (isSetMStringString()); + builder.append(present_mStringString); + if (present_mStringString) + builder.append(mStringString); + + return builder.toHashCode(); + } + + public int compareTo(Complex other) { + if (!getClass().equals(other.getClass())) { + return getClass().getName().compareTo(other.getClass().getName()); + } + + int lastComparison = 0; + Complex typedOther = (Complex)other; + + lastComparison = Boolean.valueOf(isSetAint()).compareTo(typedOther.isSetAint()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetAint()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.aint, typedOther.aint); + if (lastComparison != 0) { + return lastComparison; + } + } + lastComparison = Boolean.valueOf(isSetAString()).compareTo(typedOther.isSetAString()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetAString()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.aString, typedOther.aString); + if (lastComparison != 0) { + return lastComparison; + } + } + lastComparison = Boolean.valueOf(isSetLint()).compareTo(typedOther.isSetLint()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetLint()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.lint, typedOther.lint); + if (lastComparison != 0) { + return lastComparison; + } + } + lastComparison = Boolean.valueOf(isSetLString()).compareTo(typedOther.isSetLString()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetLString()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.lString, typedOther.lString); + if (lastComparison != 0) { + return lastComparison; + } + } + lastComparison = 
Boolean.valueOf(isSetLintString()).compareTo(typedOther.isSetLintString()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetLintString()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.lintString, typedOther.lintString); + if (lastComparison != 0) { + return lastComparison; + } + } + lastComparison = Boolean.valueOf(isSetMStringString()).compareTo(typedOther.isSetMStringString()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetMStringString()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.mStringString, typedOther.mStringString); + if (lastComparison != 0) { + return lastComparison; + } + } + return 0; + } + + public _Fields fieldForId(int fieldId) { + return _Fields.findByThriftId(fieldId); + } + + public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException { + schemes.get(iprot.getScheme()).getScheme().read(iprot, this); + } + + public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException { + schemes.get(oprot.getScheme()).getScheme().write(oprot, this); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("Complex("); + boolean first = true; + + sb.append("aint:"); + sb.append(this.aint); + first = false; + if (!first) sb.append(", "); + sb.append("aString:"); + if (this.aString == null) { + sb.append("null"); + } else { + sb.append(this.aString); + } + first = false; + if (!first) sb.append(", "); + sb.append("lint:"); + if (this.lint == null) { + sb.append("null"); + } else { + sb.append(this.lint); + } + first = false; + if (!first) sb.append(", "); + sb.append("lString:"); + if (this.lString == null) { + sb.append("null"); + } else { + sb.append(this.lString); + } + first = false; + if (!first) sb.append(", "); + sb.append("lintString:"); + if (this.lintString == null) { + sb.append("null"); + } else { + sb.append(this.lintString); + } + first = false; + if (!first) sb.append(", "); + sb.append("mStringString:"); + if (this.mStringString == null) { + sb.append("null"); + } else { + sb.append(this.mStringString); + } + first = false; + sb.append(")"); + return sb.toString(); + } + + public void validate() throws org.apache.thrift.TException { + // check for required fields + // check for sub-struct validity + } + + private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException { + try { + write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out))); + } catch (org.apache.thrift.TException te) { + throw new java.io.IOException(te); + } + } + + private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { + try { + // it doesn't seem like you should have to do this, but java serialization is wacky, and doesn't call the default constructor. 
+ __isset_bitfield = 0; + read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in))); + } catch (org.apache.thrift.TException te) { + throw new java.io.IOException(te); + } + } + + private static class ComplexStandardSchemeFactory implements SchemeFactory { + public ComplexStandardScheme getScheme() { + return new ComplexStandardScheme(); + } + } + + private static class ComplexStandardScheme extends StandardScheme<Complex> { + + public void read(org.apache.thrift.protocol.TProtocol iprot, Complex struct) throws org.apache.thrift.TException { + org.apache.thrift.protocol.TField schemeField; + iprot.readStructBegin(); + while (true) + { + schemeField = iprot.readFieldBegin(); + if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { + break; + } + switch (schemeField.id) { + case 1: // AINT + if (schemeField.type == org.apache.thrift.protocol.TType.I32) { + struct.aint = iprot.readI32(); + struct.setAintIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + case 2: // A_STRING + if (schemeField.type == org.apache.thrift.protocol.TType.STRING) { + struct.aString = iprot.readString(); + struct.setAStringIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + case 3: // LINT + if (schemeField.type == org.apache.thrift.protocol.TType.LIST) { + { + org.apache.thrift.protocol.TList _list0 = iprot.readListBegin(); + struct.lint = new ArrayList<Integer>(_list0.size); + for (int _i1 = 0; _i1 < _list0.size; ++_i1) + { + int _elem2; // required + _elem2 = iprot.readI32(); + struct.lint.add(_elem2); + } + iprot.readListEnd(); + } + struct.setLintIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + case 4: // L_STRING + if (schemeField.type == org.apache.thrift.protocol.TType.LIST) { + { + org.apache.thrift.protocol.TList _list3 = iprot.readListBegin(); + struct.lString = new ArrayList<String>(_list3.size); + for (int _i4 = 0; _i4 < _list3.size; ++_i4) + { + String _elem5; // required + _elem5 = iprot.readString(); + struct.lString.add(_elem5); + } + iprot.readListEnd(); + } + struct.setLStringIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + case 5: // LINT_STRING + if (schemeField.type == org.apache.thrift.protocol.TType.LIST) { + { + org.apache.thrift.protocol.TList _list6 = iprot.readListBegin(); + struct.lintString = new ArrayList<IntString>(_list6.size); + for (int _i7 = 0; _i7 < _list6.size; ++_i7) + { + IntString _elem8; // required + _elem8 = new IntString(); + _elem8.read(iprot); + struct.lintString.add(_elem8); + } + iprot.readListEnd(); + } + struct.setLintStringIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + case 6: // M_STRING_STRING + if (schemeField.type == org.apache.thrift.protocol.TType.MAP) { + { + org.apache.thrift.protocol.TMap _map9 = iprot.readMapBegin(); + struct.mStringString = new HashMap<String,String>(2*_map9.size); + for (int _i10 = 0; _i10 < _map9.size; ++_i10) + { + String _key11; // required + String _val12; // required + _key11 = iprot.readString(); + _val12 = iprot.readString(); + struct.mStringString.put(_key11, _val12); + } + iprot.readMapEnd(); + } + struct.setMStringStringIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; + default: + org.apache.thrift.protocol.TProtocolUtil.skip(iprot,
schemeField.type); + } + iprot.readFieldEnd(); + } + iprot.readStructEnd(); + struct.validate(); + } + + public void write(org.apache.thrift.protocol.TProtocol oprot, Complex struct) throws org.apache.thrift.TException { + struct.validate(); + + oprot.writeStructBegin(STRUCT_DESC); + oprot.writeFieldBegin(AINT_FIELD_DESC); + oprot.writeI32(struct.aint); + oprot.writeFieldEnd(); + if (struct.aString != null) { + oprot.writeFieldBegin(A_STRING_FIELD_DESC); + oprot.writeString(struct.aString); + oprot.writeFieldEnd(); + } + if (struct.lint != null) { + oprot.writeFieldBegin(LINT_FIELD_DESC); + { + oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, struct.lint.size())); + for (int _iter13 : struct.lint) + { + oprot.writeI32(_iter13); + } + oprot.writeListEnd(); + } + oprot.writeFieldEnd(); + } + if (struct.lString != null) { + oprot.writeFieldBegin(L_STRING_FIELD_DESC); + { + oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.lString.size())); + for (String _iter14 : struct.lString) + { + oprot.writeString(_iter14); + } + oprot.writeListEnd(); + } + oprot.writeFieldEnd(); + } + if (struct.lintString != null) { + oprot.writeFieldBegin(LINT_STRING_FIELD_DESC); + { + oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.lintString.size())); + for (IntString _iter15 : struct.lintString) + { + _iter15.write(oprot); + } + oprot.writeListEnd(); + } + oprot.writeFieldEnd(); + } + if (struct.mStringString != null) { + oprot.writeFieldBegin(M_STRING_STRING_FIELD_DESC); + { + oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, struct.mStringString.size())); + for (Map.Entry _iter16 : struct.mStringString.entrySet()) + { + oprot.writeString(_iter16.getKey()); + oprot.writeString(_iter16.getValue()); + } + oprot.writeMapEnd(); + } + oprot.writeFieldEnd(); + } + oprot.writeFieldStop(); + oprot.writeStructEnd(); + } + + } + + private static class ComplexTupleSchemeFactory implements SchemeFactory { + public ComplexTupleScheme getScheme() { + return new ComplexTupleScheme(); + } + } + + private static class ComplexTupleScheme extends TupleScheme { + + @Override + public void write(org.apache.thrift.protocol.TProtocol prot, Complex struct) throws org.apache.thrift.TException { + TTupleProtocol oprot = (TTupleProtocol) prot; + BitSet optionals = new BitSet(); + if (struct.isSetAint()) { + optionals.set(0); + } + if (struct.isSetAString()) { + optionals.set(1); + } + if (struct.isSetLint()) { + optionals.set(2); + } + if (struct.isSetLString()) { + optionals.set(3); + } + if (struct.isSetLintString()) { + optionals.set(4); + } + if (struct.isSetMStringString()) { + optionals.set(5); + } + oprot.writeBitSet(optionals, 6); + if (struct.isSetAint()) { + oprot.writeI32(struct.aint); + } + if (struct.isSetAString()) { + oprot.writeString(struct.aString); + } + if (struct.isSetLint()) { + { + oprot.writeI32(struct.lint.size()); + for (int _iter17 : struct.lint) + { + oprot.writeI32(_iter17); + } + } + } + if (struct.isSetLString()) { + { + oprot.writeI32(struct.lString.size()); + for (String _iter18 : struct.lString) + { + oprot.writeString(_iter18); + } + } + } + if (struct.isSetLintString()) { + { + oprot.writeI32(struct.lintString.size()); + for (IntString _iter19 : struct.lintString) + { + _iter19.write(oprot); + } + } + } + if (struct.isSetMStringString()) { + { + 
oprot.writeI32(struct.mStringString.size()); + for (Map.Entry<String,String> _iter20 : struct.mStringString.entrySet()) + { + oprot.writeString(_iter20.getKey()); + oprot.writeString(_iter20.getValue()); + } + } + } + } + + @Override + public void read(org.apache.thrift.protocol.TProtocol prot, Complex struct) throws org.apache.thrift.TException { + TTupleProtocol iprot = (TTupleProtocol) prot; + BitSet incoming = iprot.readBitSet(6); + if (incoming.get(0)) { + struct.aint = iprot.readI32(); + struct.setAintIsSet(true); + } + if (incoming.get(1)) { + struct.aString = iprot.readString(); + struct.setAStringIsSet(true); + } + if (incoming.get(2)) { + { + org.apache.thrift.protocol.TList _list21 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, iprot.readI32()); + struct.lint = new ArrayList<Integer>(_list21.size); + for (int _i22 = 0; _i22 < _list21.size; ++_i22) + { + int _elem23; // required + _elem23 = iprot.readI32(); + struct.lint.add(_elem23); + } + } + struct.setLintIsSet(true); + } + if (incoming.get(3)) { + { + org.apache.thrift.protocol.TList _list24 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32()); + struct.lString = new ArrayList<String>(_list24.size); + for (int _i25 = 0; _i25 < _list24.size; ++_i25) + { + String _elem26; // required + _elem26 = iprot.readString(); + struct.lString.add(_elem26); + } + } + struct.setLStringIsSet(true); + } + if (incoming.get(4)) { + { + org.apache.thrift.protocol.TList _list27 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32()); + struct.lintString = new ArrayList<IntString>(_list27.size); + for (int _i28 = 0; _i28 < _list27.size; ++_i28) + { + IntString _elem29; // required + _elem29 = new IntString(); + _elem29.read(iprot); + struct.lintString.add(_elem29); + } + } + struct.setLintStringIsSet(true); + } + if (incoming.get(5)) { + { + org.apache.thrift.protocol.TMap _map30 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, iprot.readI32()); + struct.mStringString = new HashMap<String,String>(2*_map30.size); + for (int _i31 = 0; _i31 < _map30.size; ++_i31) + { + String _key32; // required + String _val33; // required + _key32 = iprot.readString(); + _val33 = iprot.readString(); + struct.mStringString.put(_key32, _val33); + } + } + struct.setMStringStringIsSet(true); + } + } + } + +} + diff --git a/sql/hive/src/test/resources/golden/! operator-0-ee7f6a60a9792041b85b18cda56429bf b/sql/hive/src/test/resources/golden/! operator-0-ee7f6a60a9792041b85b18cda56429bf new file mode 100644 index 000000000000..d00491fd7e5b --- /dev/null +++ b/sql/hive/src/test/resources/golden/!
operator-0-ee7f6a60a9792041b85b18cda56429bf @@ -0,0 +1 @@ +1 diff --git a/sql/hive/src/test/resources/golden/Column pruning - non-trivial top project with aliases - query test-0-515e406ffb23f6fd0d8cd34c2b25fbe6 b/sql/hive/src/test/resources/golden/Column pruning - non-trivial top project with aliases - query test-0-515e406ffb23f6fd0d8cd34c2b25fbe6 new file mode 100644 index 000000000000..9a276bc794c0 --- /dev/null +++ b/sql/hive/src/test/resources/golden/Column pruning - non-trivial top project with aliases - query test-0-515e406ffb23f6fd0d8cd34c2b25fbe6 @@ -0,0 +1,3 @@ +476 +172 +622 diff --git a/sql/hive/src/test/resources/golden/Partition pruning - non-partitioned, non-trivial project - query test-0-eabbebd5c1d127b1605bfec52d7b7f3f b/sql/hive/src/test/resources/golden/Partition pruning - non-partitioned, non-trivial project - query test-0-eabbebd5c1d127b1605bfec52d7b7f3f new file mode 100644 index 000000000000..444039e75fba --- /dev/null +++ b/sql/hive/src/test/resources/golden/Partition pruning - non-partitioned, non-trivial project - query test-0-eabbebd5c1d127b1605bfec52d7b7f3f @@ -0,0 +1,500 @@ +476 +172 +622 +54 +330 +818 +510 +556 +196 +968 +530 +386 +802 +300 +546 +448 +738 +132 +256 +426 +292 +812 +858 +748 +304 +938 +290 +990 +74 +654 +562 +554 +418 +30 +164 +806 +332 +834 +860 +504 +584 +438 +574 +306 +386 +676 +892 +918 +788 +474 +964 +348 +826 +988 +414 +398 +932 +416 +348 +798 +792 +494 +834 +978 +324 +754 +794 +618 +730 +532 +878 +684 +734 +650 +334 +390 +950 +34 +226 +310 +406 +678 +0 +910 +256 +622 +632 +114 +604 +410 +298 +876 +690 +258 +340 +40 +978 +314 +756 +442 +184 +222 +94 +144 +8 +560 +70 +854 +554 +416 +712 +798 +338 +764 +996 +250 +772 +874 +938 +384 +572 +374 +352 +108 +918 +102 +276 +206 +478 +426 +432 +860 +556 +352 +578 +442 +130 +636 +664 +622 +550 +274 +482 +166 +666 +360 +568 +24 +460 +362 +134 +520 +808 +768 +978 +706 +746 +544 +276 +434 +168 +696 +932 +116 +16 +822 +460 +416 +696 +48 +926 +862 +358 +344 +84 +258 +316 +238 +992 +0 +644 +394 +936 +786 +908 +200 +596 +398 +382 +836 +192 +52 +330 +654 +460 +410 +240 +262 +102 +808 +86 +872 +312 +938 +936 +616 +190 +392 +576 +962 +914 +196 +564 +394 +374 +636 +636 +818 +940 +274 +738 +632 +338 +826 +170 +154 +0 +980 +174 +728 +358 +236 +268 +790 +564 +276 +476 +838 +30 +236 +144 +180 +614 +38 +870 +20 +554 +546 +612 +448 +618 +778 +654 +484 +738 +784 +544 +662 +802 +484 +904 +354 +452 +10 +994 +804 +792 +634 +790 +116 +70 +672 +190 +22 +336 +68 +458 +466 +286 +944 +644 +996 +320 +390 +84 +642 +860 +238 +978 +916 +156 +152 +82 +446 +984 +298 +898 +436 +456 +276 +906 +60 +418 +128 +936 +152 +148 +684 +138 +460 +66 +736 +206 +592 +226 +432 +734 +688 +334 +548 +438 +478 +970 +232 +446 +512 +526 +140 +974 +960 +802 +576 +382 +10 +488 +876 +256 +934 +864 +404 +632 +458 +938 +926 +560 +4 +70 +566 +662 +470 +160 +88 +386 +642 +670 +208 +932 +732 +350 +806 +966 +106 +210 +514 +812 +818 +380 +812 +802 +228 +516 +180 +406 +524 +696 +848 +24 +792 +402 +434 +328 +862 +908 +956 +596 +250 +862 +328 +848 +374 +764 +10 +140 +794 +960 +582 +48 +702 +510 +208 +140 +326 +876 +238 +828 +400 +982 +474 +878 +720 +496 +958 +610 +834 +398 +888 +240 +858 +338 +886 +646 +650 +554 +460 +956 +356 +936 +620 +634 +666 +986 +920 +414 +498 +530 +960 +166 +272 +706 +344 +428 +924 +466 +812 +266 +350 +378 +908 +750 +802 +842 +814 +768 +512 +52 +268 +134 +768 +758 +36 +924 +984 +200 +596 +18 +682 +996 +292 +916 +724 +372 +570 +696 +334 +36 +546 +366 +562 +688 +194 +938 +630 +168 +56 +74 +896 +304 +696 +614 +388 +828 +954 +444 +252 +180 +338 
+806 +800 +400 +194 diff --git a/sql/hive/src/test/resources/golden/convert_enum_to_string-1-db089ff46f9826c7883198adacdfad59 b/sql/hive/src/test/resources/golden/convert_enum_to_string-1-db089ff46f9826c7883198adacdfad59 index d35bf9093ca9..2383bef94097 100644 --- a/sql/hive/src/test/resources/golden/convert_enum_to_string-1-db089ff46f9826c7883198adacdfad59 +++ b/sql/hive/src/test/resources/golden/convert_enum_to_string-1-db089ff46f9826c7883198adacdfad59 @@ -15,9 +15,9 @@ my_enum_structlist_map map from deserializer my_structlist array>> from deserializer my_enumlist array from deserializer -my_stringset struct<> from deserializer -my_enumset struct<> from deserializer -my_structset struct<> from deserializer +my_stringset array from deserializer +my_enumset array from deserializer +my_structset array>> from deserializer optionals struct<> from deserializer b string diff --git a/sql/hive/src/test/resources/golden/get_json_object #1-0-f01b340b5662c45bb5f1e3b7c6900e1f b/sql/hive/src/test/resources/golden/get_json_object #1-0-f01b340b5662c45bb5f1e3b7c6900e1f new file mode 100644 index 000000000000..1dcda4315a14 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #1-0-f01b340b5662c45bb5f1e3b7c6900e1f @@ -0,0 +1 @@ +{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"book":[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}},"email":"amy@only_for_json_udf_test.net","owner":"amy","zip code":"94025","fb:testid":"1234"} diff --git a/sql/hive/src/test/resources/golden/get_json_object #10-0-f3f47d06d7c51d493d68112b0bd6c1fc b/sql/hive/src/test/resources/golden/get_json_object #10-0-f3f47d06d7c51d493d68112b0bd6c1fc new file mode 100644 index 000000000000..81c545efebe5 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #10-0-f3f47d06d7c51d493d68112b0bd6c1fc @@ -0,0 +1 @@ +1234 diff --git a/sql/hive/src/test/resources/golden/get_json_object #2-0-e84c2f8136919830fd665a278e4158a b/sql/hive/src/test/resources/golden/get_json_object #2-0-e84c2f8136919830fd665a278e4158a new file mode 100644 index 000000000000..99127db9e311 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #2-0-e84c2f8136919830fd665a278e4158a @@ -0,0 +1 @@ +amy {"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"book":[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. 
Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}} diff --git a/sql/hive/src/test/resources/golden/get_json_object #3-0-bf140c65c31f8d892ec23e41e16e58bb b/sql/hive/src/test/resources/golden/get_json_object #3-0-bf140c65c31f8d892ec23e41e16e58bb new file mode 100644 index 000000000000..0bc03998296a --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #3-0-bf140c65c31f8d892ec23e41e16e58bb @@ -0,0 +1 @@ +{"price":19.95,"color":"red"} [{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}] diff --git a/sql/hive/src/test/resources/golden/get_json_object #4-0-f0bd902edc1990c9a6c65a6bb672c4d5 b/sql/hive/src/test/resources/golden/get_json_object #4-0-f0bd902edc1990c9a6c65a6bb672c4d5 new file mode 100644 index 000000000000..4f7e09bd3fa7 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #4-0-f0bd902edc1990c9a6c65a6bb672c4d5 @@ -0,0 +1 @@ +{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95} [{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}] diff --git a/sql/hive/src/test/resources/golden/get_json_object #5-0-3c09f4316a1533049aee8af749cdcab b/sql/hive/src/test/resources/golden/get_json_object #5-0-3c09f4316a1533049aee8af749cdcab new file mode 100644 index 000000000000..b2d212a597d9 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #5-0-3c09f4316a1533049aee8af749cdcab @@ -0,0 +1 @@ +reference ["reference","fiction","fiction"] ["0-553-21311-3","0-395-19395-8"] [{"age":25,"name":"bob"},{"age":26,"name":"jack"}] diff --git a/sql/hive/src/test/resources/golden/get_json_object #6-0-8334d1ddbe0f41fc7b80d4e6b45409da b/sql/hive/src/test/resources/golden/get_json_object #6-0-8334d1ddbe0f41fc7b80d4e6b45409da new file mode 100644 index 000000000000..21d88629fcdb --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #6-0-8334d1ddbe0f41fc7b80d4e6b45409da @@ -0,0 +1 @@ +25 [25,26] diff --git a/sql/hive/src/test/resources/golden/get_json_object #7-0-40d7dff94b26a2e3f4ab71baee3d3ce0 b/sql/hive/src/test/resources/golden/get_json_object #7-0-40d7dff94b26a2e3f4ab71baee3d3ce0 new file mode 100644 index 000000000000..e60721e1dd24 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #7-0-40d7dff94b26a2e3f4ab71baee3d3ce0 @@ -0,0 +1 @@ +2 [[1,2,{"b":"y","a":"x"}],[3,4],[5,6]] 1 [1,2,{"b":"y","a":"x"}] [1,2,{"b":"y","a":"x"},3,4,5,6] y ["y"] diff --git a/sql/hive/src/test/resources/golden/get_json_object #8-0-180b4b6fdb26011fec05a7ca99fd9844 b/sql/hive/src/test/resources/golden/get_json_object #8-0-180b4b6fdb26011fec05a7ca99fd9844 new file mode 100644 index 000000000000..356fcdf7139b --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #8-0-180b4b6fdb26011fec05a7ca99fd9844 @@ -0,0 +1 @@ 
+NULL NULL NULL NULL NULL NULL diff --git a/sql/hive/src/test/resources/golden/get_json_object #9-0-47c451a969d856f008f4d6b3d378d94b b/sql/hive/src/test/resources/golden/get_json_object #9-0-47c451a969d856f008f4d6b3d378d94b new file mode 100644 index 000000000000..ef4a39675ed6 --- /dev/null +++ b/sql/hive/src/test/resources/golden/get_json_object #9-0-47c451a969d856f008f4d6b3d378d94b @@ -0,0 +1 @@ +94025 diff --git a/sql/hive/src/test/resources/golden/parenthesis_star_by-5-6888c7f7894910538d82eefa23443189 b/sql/hive/src/test/resources/golden/parenthesis_star_by-5-41d474f5e6d7c61c36f74b4bec4e9e44 similarity index 100% rename from sql/hive/src/test/resources/golden/parenthesis_star_by-5-6888c7f7894910538d82eefa23443189 rename to sql/hive/src/test/resources/golden/parenthesis_star_by-5-41d474f5e6d7c61c36f74b4bec4e9e44 diff --git a/sql/hive/src/test/resources/golden/show_create_table_alter-3-2a91d52719cf4552ebeb867204552a26 b/sql/hive/src/test/resources/golden/show_create_table_alter-3-2a91d52719cf4552ebeb867204552a26 index 501bb6ab32f2..7bb2c0ab4398 100644 --- a/sql/hive/src/test/resources/golden/show_create_table_alter-3-2a91d52719cf4552ebeb867204552a26 +++ b/sql/hive/src/test/resources/golden/show_create_table_alter-3-2a91d52719cf4552ebeb867204552a26 @@ -1,4 +1,4 @@ -CREATE TABLE `tmp_showcrt1`( +CREATE TABLE `tmp_showcrt1`( `key` smallint, `value` float) COMMENT 'temporary table' diff --git a/sql/hive/src/test/resources/golden/show_create_table_db_table-4-b585371b624cbab2616a49f553a870a0 b/sql/hive/src/test/resources/golden/show_create_table_db_table-4-b585371b624cbab2616a49f553a870a0 index 90f8415a1c6b..3cc1a57ee3a4 100644 --- a/sql/hive/src/test/resources/golden/show_create_table_db_table-4-b585371b624cbab2616a49f553a870a0 +++ b/sql/hive/src/test/resources/golden/show_create_table_db_table-4-b585371b624cbab2616a49f553a870a0 @@ -1,4 +1,4 @@ -CREATE TABLE `tmp_feng.tmp_showcrt`( +CREATE TABLE `tmp_feng.tmp_showcrt`( `key` string, `value` int) ROW FORMAT SERDE diff --git a/sql/hive/src/test/resources/golden/show_create_table_delimited-1-2a91d52719cf4552ebeb867204552a26 b/sql/hive/src/test/resources/golden/show_create_table_delimited-1-2a91d52719cf4552ebeb867204552a26 index 4ee22e523031..b51c71a71f91 100644 --- a/sql/hive/src/test/resources/golden/show_create_table_delimited-1-2a91d52719cf4552ebeb867204552a26 +++ b/sql/hive/src/test/resources/golden/show_create_table_delimited-1-2a91d52719cf4552ebeb867204552a26 @@ -1,4 +1,4 @@ -CREATE TABLE `tmp_showcrt1`( +CREATE TABLE `tmp_showcrt1`( `key` int, `value` string, `newvalue` bigint) diff --git a/sql/hive/src/test/resources/golden/show_create_table_serde-1-2a91d52719cf4552ebeb867204552a26 b/sql/hive/src/test/resources/golden/show_create_table_serde-1-2a91d52719cf4552ebeb867204552a26 index 6fda2570b53f..29189e1d860a 100644 --- a/sql/hive/src/test/resources/golden/show_create_table_serde-1-2a91d52719cf4552ebeb867204552a26 +++ b/sql/hive/src/test/resources/golden/show_create_table_serde-1-2a91d52719cf4552ebeb867204552a26 @@ -1,4 +1,4 @@ -CREATE TABLE `tmp_showcrt1`( +CREATE TABLE `tmp_showcrt1`( `key` int, `value` string, `newvalue` bigint) diff --git a/sql/hive/src/test/resources/golden/show_functions-0-45a7762c39f1b0f26f076220e2764043 b/sql/hive/src/test/resources/golden/show_functions-0-45a7762c39f1b0f26f076220e2764043 index 3049cd6243ad..1b283db3e774 100644 --- a/sql/hive/src/test/resources/golden/show_functions-0-45a7762c39f1b0f26f076220e2764043 +++ b/sql/hive/src/test/resources/golden/show_functions-0-45a7762c39f1b0f26f076220e2764043 @@ 
-17,6 +17,7 @@ ^ abs acos +add_months and array array_contains @@ -29,6 +30,7 @@ base64 between bin case +cbrt ceil ceiling coalesce @@ -47,7 +49,11 @@ covar_samp create_union cume_dist current_database +current_date +current_timestamp +current_user date_add +date_format date_sub datediff day @@ -65,6 +71,7 @@ ewah_bitmap_empty ewah_bitmap_or exp explode +factorial field find_in_set first_value @@ -73,6 +80,7 @@ format_number from_unixtime from_utc_timestamp get_json_object +greatest hash hex histogram_numeric @@ -81,6 +89,7 @@ if in in_file index +initcap inline instr isnotnull @@ -88,10 +97,13 @@ isnull java_method json_tuple lag +last_day last_value lcase lead +least length +levenshtein like ln locate @@ -109,11 +121,15 @@ max min minute month +months_between named_struct negative +next_day ngrams noop +noopstreaming noopwithmap +noopwithmapstreaming not ntile nvl @@ -147,10 +163,14 @@ rpad rtrim second sentences +shiftleft +shiftright +shiftrightunsigned sign sin size sort_array +soundex space split sqrt @@ -170,6 +190,7 @@ to_unix_timestamp to_utc_timestamp translate trim +trunc ucase unbase64 unhex diff --git a/sql/hive/src/test/resources/golden/show_tblproperties-1-be4adb893c7f946ebd76a648ce3cc1ae b/sql/hive/src/test/resources/golden/show_tblproperties-1-be4adb893c7f946ebd76a648ce3cc1ae index 0f6cc6f44f1f..fdf701f96280 100644 --- a/sql/hive/src/test/resources/golden/show_tblproperties-1-be4adb893c7f946ebd76a648ce3cc1ae +++ b/sql/hive/src/test/resources/golden/show_tblproperties-1-be4adb893c7f946ebd76a648ce3cc1ae @@ -1 +1 @@ -Table tmpfoo does not have property: bar +Table default.tmpfoo does not have property: bar diff --git a/sql/hive/src/test/resources/golden/udf_date_add-1-efb60fcbd6d78ad35257fb1ec39ace2 b/sql/hive/src/test/resources/golden/udf_date_add-1-efb60fcbd6d78ad35257fb1ec39ace2 index 3c91e138d7bd..d8ec084f0b2b 100644 --- a/sql/hive/src/test/resources/golden/udf_date_add-1-efb60fcbd6d78ad35257fb1ec39ace2 +++ b/sql/hive/src/test/resources/golden/udf_date_add-1-efb60fcbd6d78ad35257fb1ec39ace2 @@ -1,5 +1,5 @@ date_add(start_date, num_days) - Returns the date that is num_days after start_date. start_date is a string in the format 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. num_days is a number. The time part of start_date is ignored. Example: - > SELECT date_add('2009-30-07', 1) FROM src LIMIT 1; - '2009-31-07' + > SELECT date_add('2009-07-30', 1) FROM src LIMIT 1; + '2009-07-31' diff --git a/sql/hive/src/test/resources/golden/udf_date_sub-1-7efeb74367835ade71e5e42b22f8ced4 b/sql/hive/src/test/resources/golden/udf_date_sub-1-7efeb74367835ade71e5e42b22f8ced4 index 29d663f35c58..169c50003625 100644 --- a/sql/hive/src/test/resources/golden/udf_date_sub-1-7efeb74367835ade71e5e42b22f8ced4 +++ b/sql/hive/src/test/resources/golden/udf_date_sub-1-7efeb74367835ade71e5e42b22f8ced4 @@ -1,5 +1,5 @@ date_sub(start_date, num_days) - Returns the date that is num_days before start_date. start_date is a string in the format 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. num_days is a number. The time part of start_date is ignored. 
Example: - > SELECT date_sub('2009-30-07', 1) FROM src LIMIT 1; - '2009-29-07' + > SELECT date_sub('2009-07-30', 1) FROM src LIMIT 1; + '2009-07-29' diff --git a/sql/hive/src/test/resources/golden/udf_datediff-1-34ae7a68b13c2bc9a89f61acf2edd4c5 b/sql/hive/src/test/resources/golden/udf_datediff-1-34ae7a68b13c2bc9a89f61acf2edd4c5 index 7ccaee7ad3bd..42197f7ad3e5 100644 --- a/sql/hive/src/test/resources/golden/udf_datediff-1-34ae7a68b13c2bc9a89f61acf2edd4c5 +++ b/sql/hive/src/test/resources/golden/udf_datediff-1-34ae7a68b13c2bc9a89f61acf2edd4c5 @@ -1,5 +1,5 @@ datediff(date1, date2) - Returns the number of days between date1 and date2 date1 and date2 are strings in the format 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. The time parts are ignored.If date1 is earlier than date2, the result is negative. Example: - > SELECT datediff('2009-30-07', '2009-31-07') FROM src LIMIT 1; + > SELECT datediff('2009-07-30', '2009-07-31') FROM src LIMIT 1; 1 diff --git a/sql/hive/src/test/resources/golden/udf_day-0-c4c503756384ff1220222d84fd25e756 b/sql/hive/src/test/resources/golden/udf_day-0-c4c503756384ff1220222d84fd25e756 index d4017178b4e6..09703d10eab7 100644 --- a/sql/hive/src/test/resources/golden/udf_day-0-c4c503756384ff1220222d84fd25e756 +++ b/sql/hive/src/test/resources/golden/udf_day-0-c4c503756384ff1220222d84fd25e756 @@ -1 +1 @@ -day(date) - Returns the date of the month of date +day(param) - Returns the day of the month of date/timestamp, or day component of interval diff --git a/sql/hive/src/test/resources/golden/udf_day-1-87168babe1110fe4c38269843414ca4 b/sql/hive/src/test/resources/golden/udf_day-1-87168babe1110fe4c38269843414ca4 index 6135aafa5086..7c0ec1dc3be5 100644 --- a/sql/hive/src/test/resources/golden/udf_day-1-87168babe1110fe4c38269843414ca4 +++ b/sql/hive/src/test/resources/golden/udf_day-1-87168babe1110fe4c38269843414ca4 @@ -1,6 +1,9 @@ -day(date) - Returns the date of the month of date +day(param) - Returns the day of the month of date/timestamp, or day component of interval Synonyms: dayofmonth -date is a string in the format of 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. -Example: - > SELECT day('2009-30-07', 1) FROM src LIMIT 1; +param can be one of: +1. A string in the format of 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. +2. A date value +3. A timestamp value +4. 
A day-time interval valueExample: + > SELECT day('2009-07-30') FROM src LIMIT 1; 30 diff --git a/sql/hive/src/test/resources/golden/udf_dayofmonth-0-7b2caf942528656555cf19c261a18502 b/sql/hive/src/test/resources/golden/udf_dayofmonth-0-7b2caf942528656555cf19c261a18502 index 47a7018d9d5a..c37eb0ec2e96 100644 --- a/sql/hive/src/test/resources/golden/udf_dayofmonth-0-7b2caf942528656555cf19c261a18502 +++ b/sql/hive/src/test/resources/golden/udf_dayofmonth-0-7b2caf942528656555cf19c261a18502 @@ -1 +1 @@ -dayofmonth(date) - Returns the date of the month of date +dayofmonth(param) - Returns the day of the month of date/timestamp, or day component of interval diff --git a/sql/hive/src/test/resources/golden/udf_dayofmonth-1-ca24d07102ad264d79ff30c64a73a7e8 b/sql/hive/src/test/resources/golden/udf_dayofmonth-1-ca24d07102ad264d79ff30c64a73a7e8 index d9490e20a3b6..9e931f649914 100644 --- a/sql/hive/src/test/resources/golden/udf_dayofmonth-1-ca24d07102ad264d79ff30c64a73a7e8 +++ b/sql/hive/src/test/resources/golden/udf_dayofmonth-1-ca24d07102ad264d79ff30c64a73a7e8 @@ -1,6 +1,9 @@ -dayofmonth(date) - Returns the date of the month of date +dayofmonth(param) - Returns the day of the month of date/timestamp, or day component of interval Synonyms: day -date is a string in the format of 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. -Example: - > SELECT dayofmonth('2009-30-07', 1) FROM src LIMIT 1; +param can be one of: +1. A string in the format of 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'. +2. A date value +3. A timestamp value +4. A day-time interval valueExample: + > SELECT dayofmonth('2009-07-30') FROM src LIMIT 1; 30 diff --git a/sql/hive/src/test/resources/golden/udf_if-0-b7ffa85b5785cccef2af1b285348cc2c b/sql/hive/src/test/resources/golden/udf_if-0-b7ffa85b5785cccef2af1b285348cc2c index 2cf0d9d61882..ce583fe81ff6 100644 --- a/sql/hive/src/test/resources/golden/udf_if-0-b7ffa85b5785cccef2af1b285348cc2c +++ b/sql/hive/src/test/resources/golden/udf_if-0-b7ffa85b5785cccef2af1b285348cc2c @@ -1 +1 @@ -There is no documentation for function 'if' +IF(expr1,expr2,expr3) - If expr1 is TRUE (expr1 <> 0 and expr1 <> NULL) then IF() returns expr2; otherwise it returns expr3. IF() returns a numeric or string value, depending on the context in which it is used. diff --git a/sql/hive/src/test/resources/golden/udf_if-1-30cf7f51f92b5684e556deff3032d49a b/sql/hive/src/test/resources/golden/udf_if-1-30cf7f51f92b5684e556deff3032d49a index 2cf0d9d61882..ce583fe81ff6 100644 --- a/sql/hive/src/test/resources/golden/udf_if-1-30cf7f51f92b5684e556deff3032d49a +++ b/sql/hive/src/test/resources/golden/udf_if-1-30cf7f51f92b5684e556deff3032d49a @@ -1 +1 @@ -There is no documentation for function 'if' +IF(expr1,expr2,expr3) - If expr1 is TRUE (expr1 <> 0 and expr1 <> NULL) then IF() returns expr2; otherwise it returns expr3. IF() returns a numeric or string value, depending on the context in which it is used. diff --git a/sql/hive/src/test/resources/golden/udf_if-1-b7ffa85b5785cccef2af1b285348cc2c b/sql/hive/src/test/resources/golden/udf_if-1-b7ffa85b5785cccef2af1b285348cc2c index 2cf0d9d61882..ce583fe81ff6 100644 --- a/sql/hive/src/test/resources/golden/udf_if-1-b7ffa85b5785cccef2af1b285348cc2c +++ b/sql/hive/src/test/resources/golden/udf_if-1-b7ffa85b5785cccef2af1b285348cc2c @@ -1 +1 @@ -There is no documentation for function 'if' +IF(expr1,expr2,expr3) - If expr1 is TRUE (expr1 <> 0 and expr1 <> NULL) then IF() returns expr2; otherwise it returns expr3. IF() returns a numeric or string value, depending on the context in which it is used. 
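The udf_if golden files above document the semantics of Hive's IF() conditional: expr2 is returned when expr1 is TRUE, otherwise expr3. The following is a minimal, hypothetical Scala sketch, not part of this patch, showing how those semantics can be exercised against a HiveContext of this era; the object name IfUdfSketch, the temporary table t, the column flag, and the local[2] master are illustrative assumptions only.

  // Hypothetical sketch (not part of the patch): exercise the IF() semantics
  // documented in the udf_if golden files above.
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.sql.hive.HiveContext

  object IfUdfSketch {
    def main(args: Array[String]): Unit = {
      val sc = new SparkContext(new SparkConf().setAppName("if-udf-sketch").setMaster("local[2]"))
      val hiveContext = new HiveContext(sc)
      import hiveContext.implicits._

      // A tiny temporary table with a single integer column named "flag".
      sc.parallelize(Seq(Tuple1(1), Tuple1(0))).toDF("flag").registerTempTable("t")

      // IF(expr1, expr2, expr3): yields 'yes' for flag = 1 and 'no' otherwise.
      hiveContext.sql("SELECT flag, IF(flag = 1, 'yes', 'no') AS answer FROM t").show()

      sc.stop()
    }
  }

If run, this would be expected to print two rows, (1, yes) and (0, no), matching the behaviour described in the golden output.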
diff --git a/sql/hive/src/test/resources/golden/udf_if-2-30cf7f51f92b5684e556deff3032d49a b/sql/hive/src/test/resources/golden/udf_if-2-30cf7f51f92b5684e556deff3032d49a index 2cf0d9d61882..ce583fe81ff6 100644 --- a/sql/hive/src/test/resources/golden/udf_if-2-30cf7f51f92b5684e556deff3032d49a +++ b/sql/hive/src/test/resources/golden/udf_if-2-30cf7f51f92b5684e556deff3032d49a @@ -1 +1 @@ -There is no documentation for function 'if' +IF(expr1,expr2,expr3) - If expr1 is TRUE (expr1 <> 0 and expr1 <> NULL) then IF() returns expr2; otherwise it returns expr3. IF() returns a numeric or string value, depending on the context in which it is used. diff --git a/sql/hive/src/test/resources/golden/udf_minute-0-9a38997c1f41f4afe00faa0abc471aee b/sql/hive/src/test/resources/golden/udf_minute-0-9a38997c1f41f4afe00faa0abc471aee index 231e4f382566..06650592f8d3 100644 --- a/sql/hive/src/test/resources/golden/udf_minute-0-9a38997c1f41f4afe00faa0abc471aee +++ b/sql/hive/src/test/resources/golden/udf_minute-0-9a38997c1f41f4afe00faa0abc471aee @@ -1 +1 @@ -minute(date) - Returns the minute of date +minute(param) - Returns the minute component of the string/timestamp/interval diff --git a/sql/hive/src/test/resources/golden/udf_minute-1-16995573ac4f4a1b047ad6ee88699e48 b/sql/hive/src/test/resources/golden/udf_minute-1-16995573ac4f4a1b047ad6ee88699e48 index ea842ea174ae..08ddc19b84d8 100644 --- a/sql/hive/src/test/resources/golden/udf_minute-1-16995573ac4f4a1b047ad6ee88699e48 +++ b/sql/hive/src/test/resources/golden/udf_minute-1-16995573ac4f4a1b047ad6ee88699e48 @@ -1,6 +1,8 @@ -minute(date) - Returns the minute of date -date is a string in the format of 'yyyy-MM-dd HH:mm:ss' or 'HH:mm:ss'. -Example: +minute(param) - Returns the minute component of the string/timestamp/interval +param can be one of: +1. A string in the format of 'yyyy-MM-dd HH:mm:ss' or 'HH:mm:ss'. +2. A timestamp value +3. A day-time interval valueExample: > SELECT minute('2009-07-30 12:58:59') FROM src LIMIT 1; 58 > SELECT minute('12:58:59') FROM src LIMIT 1; diff --git a/sql/hive/src/test/resources/golden/udf_month-0-9a38997c1f41f4afe00faa0abc471aee b/sql/hive/src/test/resources/golden/udf_month-0-9a38997c1f41f4afe00faa0abc471aee index 231e4f382566..06650592f8d3 100644 --- a/sql/hive/src/test/resources/golden/udf_month-0-9a38997c1f41f4afe00faa0abc471aee +++ b/sql/hive/src/test/resources/golden/udf_month-0-9a38997c1f41f4afe00faa0abc471aee @@ -1 +1 @@ -minute(date) - Returns the minute of date +minute(param) - Returns the minute component of the string/timestamp/interval diff --git a/sql/hive/src/test/resources/golden/udf_month-1-16995573ac4f4a1b047ad6ee88699e48 b/sql/hive/src/test/resources/golden/udf_month-1-16995573ac4f4a1b047ad6ee88699e48 index ea842ea174ae..08ddc19b84d8 100644 --- a/sql/hive/src/test/resources/golden/udf_month-1-16995573ac4f4a1b047ad6ee88699e48 +++ b/sql/hive/src/test/resources/golden/udf_month-1-16995573ac4f4a1b047ad6ee88699e48 @@ -1,6 +1,8 @@ -minute(date) - Returns the minute of date -date is a string in the format of 'yyyy-MM-dd HH:mm:ss' or 'HH:mm:ss'. -Example: +minute(param) - Returns the minute component of the string/timestamp/interval +param can be one of: +1. A string in the format of 'yyyy-MM-dd HH:mm:ss' or 'HH:mm:ss'. +2. A timestamp value +3. 
A day-time interval valueExample: > SELECT minute('2009-07-30 12:58:59') FROM src LIMIT 1; 58 > SELECT minute('12:58:59') FROM src LIMIT 1; diff --git a/sql/hive/src/test/resources/golden/udf_std-1-6759bde0e50a3607b7c3fd5a93cbd027 b/sql/hive/src/test/resources/golden/udf_std-1-6759bde0e50a3607b7c3fd5a93cbd027 index d54ebfbd6fb1..a529b107ff21 100644 --- a/sql/hive/src/test/resources/golden/udf_std-1-6759bde0e50a3607b7c3fd5a93cbd027 +++ b/sql/hive/src/test/resources/golden/udf_std-1-6759bde0e50a3607b7c3fd5a93cbd027 @@ -1,2 +1,2 @@ std(x) - Returns the standard deviation of a set of numbers -Synonyms: stddev_pop, stddev +Synonyms: stddev, stddev_pop diff --git a/sql/hive/src/test/resources/golden/udf_stddev-1-18e1d598820013453fad45852e1a303d b/sql/hive/src/test/resources/golden/udf_stddev-1-18e1d598820013453fad45852e1a303d index 5f674788180e..ac3176a38254 100644 --- a/sql/hive/src/test/resources/golden/udf_stddev-1-18e1d598820013453fad45852e1a303d +++ b/sql/hive/src/test/resources/golden/udf_stddev-1-18e1d598820013453fad45852e1a303d @@ -1,2 +1,2 @@ stddev(x) - Returns the standard deviation of a set of numbers -Synonyms: stddev_pop, std +Synonyms: std, stddev_pop diff --git a/sql/hive/src/test/resources/golden/union3-0-6a8a35102de1b0b88c6721a704eb174d b/sql/hive/src/test/resources/golden/union3-0-99620f72f0282904846a596ca5b3e46c similarity index 100% rename from sql/hive/src/test/resources/golden/union3-0-6a8a35102de1b0b88c6721a704eb174d rename to sql/hive/src/test/resources/golden/union3-0-99620f72f0282904846a596ca5b3e46c diff --git a/sql/hive/src/test/resources/golden/union3-2-2a1dcd937f117f1955a169592b96d5f9 b/sql/hive/src/test/resources/golden/union3-2-90ca96ea59fd45cf0af8c020ae77c908 similarity index 100% rename from sql/hive/src/test/resources/golden/union3-2-2a1dcd937f117f1955a169592b96d5f9 rename to sql/hive/src/test/resources/golden/union3-2-90ca96ea59fd45cf0af8c020ae77c908 diff --git a/sql/hive/src/test/resources/golden/union3-3-8fc63f8edb2969a63cd4485f1867ba97 b/sql/hive/src/test/resources/golden/union3-3-72b149ccaef751bcfe55d5ca37cb5fd7 similarity index 100% rename from sql/hive/src/test/resources/golden/union3-3-8fc63f8edb2969a63cd4485f1867ba97 rename to sql/hive/src/test/resources/golden/union3-3-72b149ccaef751bcfe55d5ca37cb5fd7 diff --git a/sql/hive/src/test/resources/log4j.properties b/sql/hive/src/test/resources/log4j.properties index 92eaf1f2795b..fea3404769d9 100644 --- a/sql/hive/src/test/resources/log4j.properties +++ b/sql/hive/src/test/resources/log4j.properties @@ -48,9 +48,14 @@ log4j.logger.hive.log=OFF log4j.additivity.parquet.hadoop.ParquetRecordReader=false log4j.logger.parquet.hadoop.ParquetRecordReader=OFF +log4j.additivity.org.apache.parquet.hadoop.ParquetRecordReader=false +log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF + +log4j.additivity.org.apache.parquet.hadoop.ParquetOutputCommitter=false +log4j.logger.org.apache.parquet.hadoop.ParquetOutputCommitter=OFF + log4j.additivity.hive.ql.metadata.Hive=false log4j.logger.hive.ql.metadata.Hive=OFF log4j.additivity.org.apache.hadoop.hive.ql.io.RCFile=false log4j.logger.org.apache.hadoop.hive.ql.io.RCFile=ERROR - diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/parenthesis_star_by.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/parenthesis_star_by.q index 9e036c1a91d3..e911fbf2d2c5 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/parenthesis_star_by.q +++ 
b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/parenthesis_star_by.q @@ -5,6 +5,6 @@ SELECT * FROM (SELECT key, value FROM src DISTRIBUTE BY key, value)t ORDER BY ke SELECT key, value FROM src CLUSTER BY (key, value); -SELECT key, value FROM src ORDER BY (key ASC, value ASC); +SELECT key, value FROM src ORDER BY key ASC, value ASC; SELECT key, value FROM src SORT BY (key, value); SELECT * FROM (SELECT key, value FROM src DISTRIBUTE BY (key, value))t ORDER BY key, value; diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/union3.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/union3.q index b26a2e2799f7..a989800cbf85 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/union3.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/union3.q @@ -1,42 +1,41 @@ +-- SORT_QUERY_RESULTS explain SELECT * FROM ( SELECT 1 AS id FROM (SELECT * FROM src LIMIT 1) s1 - CLUSTER BY id UNION ALL SELECT 2 AS id FROM (SELECT * FROM src LIMIT 1) s1 - CLUSTER BY id UNION ALL SELECT 3 AS id FROM (SELECT * FROM src LIMIT 1) s2 UNION ALL SELECT 4 AS id FROM (SELECT * FROM src LIMIT 1) s2 + CLUSTER BY id ) a; CREATE TABLE union_out (id int); -insert overwrite table union_out +insert overwrite table union_out SELECT * FROM ( SELECT 1 AS id FROM (SELECT * FROM src LIMIT 1) s1 - CLUSTER BY id UNION ALL SELECT 2 AS id FROM (SELECT * FROM src LIMIT 1) s1 - CLUSTER BY id UNION ALL SELECT 3 AS id FROM (SELECT * FROM src LIMIT 1) s2 UNION ALL SELECT 4 AS id FROM (SELECT * FROM src LIMIT 1) s2 + CLUSTER BY id ) a; -select * from union_out cluster by id; +select * from union_out; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ClasspathDependenciesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ClasspathDependenciesSuite.scala new file mode 100644 index 000000000000..34b2edb44b03 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ClasspathDependenciesSuite.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.net.URL + +import org.apache.spark.SparkFunSuite + +/** + * Verify that some classes load and that others are not found on the classpath. + * + * + * This is used to detect classpath and shading conflict, especially between + * Spark's required Kryo version and that which can be found in some Hive versions. 
+ */ +class ClasspathDependenciesSuite extends SparkFunSuite { + private val classloader = this.getClass.getClassLoader + + private def assertLoads(classname: String): Unit = { + val resourceURL: URL = Option(findResource(classname)).getOrElse { + fail(s"Class $classname not found as ${resourceName(classname)}") + } + + logInfo(s"Class $classname at $resourceURL") + classloader.loadClass(classname) + } + + private def assertLoads(classes: String*): Unit = { + classes.foreach(assertLoads) + } + + private def findResource(classname: String): URL = { + val resource = resourceName(classname) + classloader.getResource(resource) + } + + private def resourceName(classname: String): String = { + classname.replace(".", "/") + ".class" + } + + private def assertClassNotFound(classname: String): Unit = { + Option(findResource(classname)).foreach { resourceURL => + fail(s"Class $classname found at $resourceURL") + } + + intercept[ClassNotFoundException] { + classloader.loadClass(classname) + } + } + + private def assertClassNotFound(classes: String*): Unit = { + classes.foreach(assertClassNotFound) + } + + private val KRYO = "com.esotericsoftware.kryo.Kryo" + + private val SPARK_HIVE = "org.apache.hive." + private val SPARK_SHADED = "org.spark-project.hive.shaded." + + test("shaded Protobuf") { + assertLoads(SPARK_SHADED + "com.google.protobuf.ServiceException") + } + + test("hive-common") { + assertLoads("org.apache.hadoop.hive.conf.HiveConf") + } + + test("hive-exec") { + assertLoads("org.apache.hadoop.hive.ql.CommandNeedRetryException") + } + + private val STD_INSTANTIATOR = "org.objenesis.strategy.StdInstantiatorStrategy" + + test("unshaded kryo") { + assertLoads(KRYO, STD_INSTANTIATOR) + } + + test("Forbidden Dependencies") { + assertClassNotFound( + SPARK_HIVE + KRYO, + SPARK_SHADED + KRYO, + "org.apache.hive." + KRYO, + "com.esotericsoftware.shaded." + STD_INSTANTIATOR, + SPARK_HIVE + "com.esotericsoftware.shaded." + STD_INSTANTIATOR, + "org.apache.hive.com.esotericsoftware.shaded." 
+ STD_INSTANTIATOR + ) + } + + test("parquet-hadoop-bundle") { + assertLoads( + "parquet.hadoop.ParquetOutputFormat", + "parquet.hadoop.ParquetInputFormat" + ) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index 99e95fb92130..81a70b8d4226 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -133,8 +133,8 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { } } - def checkValues(row1: Seq[Any], row2: InternalRow): Unit = { - row1.zip(row2.toSeq).foreach { case (r1, r2) => + def checkValues(row1: Seq[Any], row2: InternalRow, row2Schema: StructType): Unit = { + row1.zip(row2.toSeq(row2Schema)).foreach { case (r1, r2) => checkValue(r1, r2) } } @@ -211,8 +211,10 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { case (t, idx) => StructField(s"c_$idx", t) }) val inspector = toInspector(dt) - checkValues(row, - unwrap(wrap(InternalRow.fromSeq(row), inspector, dt), inspector).asInstanceOf[InternalRow]) + checkValues( + row, + unwrap(wrap(InternalRow.fromSeq(row), inspector, dt), inspector).asInstanceOf[InternalRow], + dt) checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt))) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 983c013bcf86..574624d501f2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -17,31 +17,146 @@ package org.apache.spark.sql.hive -import org.apache.spark.{Logging, SparkFunSuite} +import java.io.File + +import org.apache.spark.sql.hive.client.{ExternalTable, ManagedTable} import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.sources.DataSourceTest +import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils} +import org.apache.spark.sql.types.{DecimalType, StringType, StructType} +import org.apache.spark.sql.{Row, SaveMode, SQLContext} +import org.apache.spark.{Logging, SparkFunSuite} -import org.apache.spark.sql.test.ExamplePointUDT -import org.apache.spark.sql.types.StructType class HiveMetastoreCatalogSuite extends SparkFunSuite with Logging { test("struct field should accept underscore in sub-column name") { - val metastr = "struct" - - val datatype = HiveMetastoreTypes.toDataType(metastr) - assert(datatype.isInstanceOf[StructType]) + val hiveTypeStr = "struct" + val dateType = HiveMetastoreTypes.toDataType(hiveTypeStr) + assert(dateType.isInstanceOf[StructType]) } test("udt to metastore type conversion") { val udt = new ExamplePointUDT - assert(HiveMetastoreTypes.toMetastoreType(udt) === - HiveMetastoreTypes.toMetastoreType(udt.sqlType)) + assertResult(HiveMetastoreTypes.toMetastoreType(udt.sqlType)) { + HiveMetastoreTypes.toMetastoreType(udt) + } } test("duplicated metastore relations") { - import TestHive.implicits._ - val df = TestHive.sql("SELECT * FROM src") + val df = sql("SELECT * FROM src") logInfo(df.queryExecution.toString) df.as('a).join(df.as('b), $"a.key" === $"b.key") } } + +class DataSourceWithHiveMetastoreCatalogSuite extends DataSourceTest with 
SQLTestUtils { + override def _sqlContext: SQLContext = TestHive + import testImplicits._ + + private val testDF = range(1, 3).select( + ('id + 0.1) cast DecimalType(10, 3) as 'd1, + 'id cast StringType as 'd2 + ).coalesce(1) + + Seq( + "parquet" -> ( + "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", + "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", + "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" + ), + + "orc" -> ( + "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", + "org.apache.hadoop.hive.ql.io.orc.OrcSerde" + ) + ).foreach { case (provider, (inputFormat, outputFormat, serde)) => + test(s"Persist non-partitioned $provider relation into metastore as managed table") { + withTable("t") { + testDF + .write + .mode(SaveMode.Overwrite) + .format(provider) + .saveAsTable("t") + + val hiveTable = catalog.client.getTable("default", "t") + assert(hiveTable.inputFormat === Some(inputFormat)) + assert(hiveTable.outputFormat === Some(outputFormat)) + assert(hiveTable.serde === Some(serde)) + + assert(!hiveTable.isPartitioned) + assert(hiveTable.tableType === ManagedTable) + + val columns = hiveTable.schema + assert(columns.map(_.name) === Seq("d1", "d2")) + assert(columns.map(_.hiveType) === Seq("decimal(10,3)", "string")) + + checkAnswer(table("t"), testDF) + assert(runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) + } + } + + test(s"Persist non-partitioned $provider relation into metastore as external table") { + withTempPath { dir => + withTable("t") { + val path = dir.getCanonicalFile + + testDF + .write + .mode(SaveMode.Overwrite) + .format(provider) + .option("path", path.toString) + .saveAsTable("t") + + val hiveTable = catalog.client.getTable("default", "t") + assert(hiveTable.inputFormat === Some(inputFormat)) + assert(hiveTable.outputFormat === Some(outputFormat)) + assert(hiveTable.serde === Some(serde)) + + assert(hiveTable.tableType === ExternalTable) + assert(hiveTable.location.get === path.toURI.toString.stripSuffix(File.separator)) + + val columns = hiveTable.schema + assert(columns.map(_.name) === Seq("d1", "d2")) + assert(columns.map(_.hiveType) === Seq("decimal(10,3)", "string")) + + checkAnswer(table("t"), testDF) + assert(runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) + } + } + } + + test(s"Persist non-partitioned $provider relation into metastore as managed table using CTAS") { + withTempPath { dir => + withTable("t") { + val path = dir.getCanonicalPath + + sql( + s"""CREATE TABLE t USING $provider + |OPTIONS (path '$path') + |AS SELECT 1 AS d1, "val_1" AS d2 + """.stripMargin) + + val hiveTable = catalog.client.getTable("default", "t") + assert(hiveTable.inputFormat === Some(inputFormat)) + assert(hiveTable.outputFormat === Some(outputFormat)) + assert(hiveTable.serde === Some(serde)) + + assert(hiveTable.isPartitioned === false) + assert(hiveTable.tableType === ExternalTable) + assert(hiveTable.partitionColumns.length === 0) + + val columns = hiveTable.schema + assert(columns.map(_.name) === Seq("d1", "d2")) + assert(columns.map(_.hiveType) === Seq("int", "string")) + + checkAnswer(table("t"), Row(1, "val_1")) + assert(runSqlHive("SELECT * FROM t") === Seq("1\tval_1")) + } + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala index a45c2d957278..fe0db5228de1 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala @@ -18,15 +18,14 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.parquet.ParquetTest -import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.execution.datasources.parquet.ParquetTest +import org.apache.spark.sql.{QueryTest, Row, SQLContext} case class Cases(lower: String, UPPER: String) class HiveParquetSuite extends QueryTest with ParquetTest { - val sqlContext = TestHive - - import sqlContext._ + private val ctx = TestHive + override def _sqlContext: SQLContext = ctx test("Case insensitive attribute names") { withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") { @@ -54,7 +53,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest { test("Converting Hive to Parquet Table via saveAsParquetFile") { withTempPath { dir => sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath) - read.parquet(dir.getCanonicalPath).registerTempTable("p") + ctx.read.parquet(dir.getCanonicalPath).registerTempTable("p") withTempTable("p") { checkAnswer( sql("SELECT * FROM src ORDER BY key"), @@ -67,7 +66,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest { withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") { withTempPath { file => sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath) - read.parquet(file.getCanonicalPath).registerTempTable("p") + ctx.read.parquet(file.getCanonicalPath).registerTempTable("p") withTempTable("p") { // let's do three overwrites for good measure sql("INSERT OVERWRITE TABLE p SELECT * FROM t") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala index f765395e148a..79cf40aba4bf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala @@ -175,4 +175,19 @@ class HiveQlSuite extends SparkFunSuite with BeforeAndAfterAll { assert(desc.serde == Option("org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe")) assert(desc.properties == Map(("tbl_p1" -> "p11"), ("tbl_p2" -> "p22"))) } + + test("Invalid interval term should throw AnalysisException") { + def assertError(sql: String, errorMessage: String): Unit = { + val e = intercept[AnalysisException] { + HiveQl.parseSql(sql) + } + assert(e.getMessage.contains(errorMessage)) + } + assertError("select interval '42-32' year to month", + "month 32 outside range [0, 11]") + assertError("select interval '5 49:12:15' day to second", + "hour 49 outside range [0, 23]") + assertError("select interval '.1111111111' second", + "nanosecond 1111111111 outside range") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 72b35959a491..dc2d85f48624 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -18,22 +18,31 @@ package org.apache.spark.sql.hive import java.io.File +import java.sql.Timestamp +import java.util.Date -import scala.sys.process.{ProcessLogger, Process} +import scala.collection.mutable.ArrayBuffer -import org.apache.spark._ -import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} -import 
org.apache.spark.util.{ResetSystemProperties, Utils} import org.scalatest.Matchers import org.scalatest.concurrent.Timeouts +import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.SpanSugar._ +import org.apache.spark._ +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} +import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer +import org.apache.spark.sql.types.DecimalType +import org.apache.spark.util.{ResetSystemProperties, Utils} + /** * This suite tests spark-submit with applications using HiveContext. */ class HiveSparkSubmitSuite extends SparkFunSuite with Matchers + // This test suite sometimes gets extremely slow out of unknown reason on Jenkins. Here we + // add a timestamp to provide more diagnosis information. with ResetSystemProperties with Timeouts { @@ -47,13 +56,15 @@ class HiveSparkSubmitSuite val unusedJar = TestUtils.createJarWithClasses(Seq.empty) val jar1 = TestUtils.createJarWithClasses(Seq("SparkSubmitClassA")) val jar2 = TestUtils.createJarWithClasses(Seq("SparkSubmitClassB")) - val jar3 = TestHive.getHiveFile("hive-contrib-0.13.1.jar").getCanonicalPath() - val jar4 = TestHive.getHiveFile("hive-hcatalog-core-0.13.1.jar").getCanonicalPath() + val jar3 = TestHive.getHiveFile("hive-contrib-0.13.1.jar").getCanonicalPath + val jar4 = TestHive.getHiveFile("hive-hcatalog-core-0.13.1.jar").getCanonicalPath val jarsString = Seq(jar1, jar2, jar3, jar4).map(j => j.toString).mkString(",") val args = Seq( "--class", SparkSubmitClassLoaderTest.getClass.getName.stripSuffix("$"), "--name", "SparkSubmitClassLoaderTest", "--master", "local-cluster[2,1,1024]", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", "--jars", jarsString, unusedJar.toString, "SparkSubmitClassA", "SparkSubmitClassB") runSparkSubmit(args) @@ -65,6 +76,8 @@ class HiveSparkSubmitSuite "--class", SparkSQLConfTest.getClass.getName.stripSuffix("$"), "--name", "SparkSQLConfTest", "--master", "local-cluster[2,1,1024]", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -76,7 +89,21 @@ class HiveSparkSubmitSuite // the HiveContext code mistakenly overrides the class loader that contains user classes. // For more detail, see sql/hive/src/test/resources/regression-test-SPARK-8489/*scala. 
val testJar = "sql/hive/src/test/resources/regression-test-SPARK-8489/test.jar" - val args = Seq("--class", "Main", testJar) + val args = Seq( + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", + "--class", "Main", + testJar) + runSparkSubmit(args) + } + + test("SPARK-9757 Persist Parquet relation with decimal column") { + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val args = Seq( + "--class", SPARK_9757.getClass.getName.stripSuffix("$"), + "--name", "SparkSQLConfTest", + "--master", "local-cluster[2,1,1024]", + unusedJar.toString) runSparkSubmit(args) } @@ -84,23 +111,55 @@ class HiveSparkSubmitSuite // This is copied from org.apache.spark.deploy.SparkSubmitSuite private def runSparkSubmit(args: Seq[String]): Unit = { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - val process = Process( - Seq("./bin/spark-submit") ++ args, - new File(sparkHome), - "SPARK_TESTING" -> "1", - "SPARK_HOME" -> sparkHome - ).run(ProcessLogger( + val history = ArrayBuffer.empty[String] + val commands = Seq("./bin/spark-submit") ++ args + val commandLine = commands.mkString("'", "' '", "'") + + val builder = new ProcessBuilder(commands: _*).directory(new File(sparkHome)) + val env = builder.environment() + env.put("SPARK_TESTING", "1") + env.put("SPARK_HOME", sparkHome) + + def captureOutput(source: String)(line: String): Unit = { + // This test suite has some weird behaviors when executed on Jenkins: + // + // 1. Sometimes it gets extremely slow out of unknown reason on Jenkins. Here we add a + // timestamp to provide more diagnosis information. + // 2. Log lines are not correctly redirected to unit-tests.log as expected, so here we print + // them out for debugging purposes. + val logLine = s"${new Timestamp(new Date().getTime)} - $source> $line" // scalastyle:off println - (line: String) => { println(s"out> $line") }, - (line: String) => { println(s"err> $line") } + println(logLine) // scalastyle:on println - )) + history += logLine + } + + val process = builder.start() + new ProcessOutputCapturer(process.getInputStream, captureOutput("stdout")).start() + new ProcessOutputCapturer(process.getErrorStream, captureOutput("stderr")).start() try { - val exitCode = failAfter(180 seconds) { process.exitValue() } + val exitCode = failAfter(180.seconds) { process.waitFor() } if (exitCode != 0) { - fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.") + // include logs in output. Note that logging is async and may not have completed + // at the time this exception is raised + Thread.sleep(1000) + val historyLog = history.mkString("\n") + fail { + s"""spark-submit returned with exit code $exitCode. + |Command line: $commandLine + | + |$historyLog + """.stripMargin + } } + } catch { + case to: TestFailedDueToTimeoutException => + val historyLog = history.mkString("\n") + fail(s"Timeout of $commandLine" + + s" See the log4j logs for more detail." + + s"\n$historyLog", to) + case t: Throwable => throw t } finally { // Ensure we still kill the process in case it timed out process.destroy() @@ -186,7 +245,7 @@ object SparkSQLConfTest extends Logging { // before spark.sql.hive.metastore.jars get set, we will see the following exception: // Exception in thread "main" java.lang.IllegalArgumentException: Builtin jars can only // be used when hive execution version == hive metastore version. - // Execution: 0.13.1 != Metastore: 0.12. 
Specify a vaild path to the correct hive jars + // Execution: 0.13.1 != Metastore: 0.12. Specify a valid path to the correct hive jars // using $HIVE_METASTORE_JARS or change spark.sql.hive.metastore.version to 0.13.1. val conf = new SparkConf() { override def getAll: Array[(String, String)] = { @@ -212,3 +271,46 @@ object SparkSQLConfTest extends Logging { sc.stop() } } + +object SPARK_9757 extends QueryTest with Logging { + def main(args: Array[String]): Unit = { + Utils.configTestLog4j("INFO") + + val sparkContext = new SparkContext( + new SparkConf() + .set("spark.sql.hive.metastore.version", "0.13.1") + .set("spark.sql.hive.metastore.jars", "maven")) + + val hiveContext = new TestHiveContext(sparkContext) + import hiveContext.implicits._ + + import org.apache.spark.sql.functions._ + + val dir = Utils.createTempDir() + dir.delete() + + try { + { + val df = + hiveContext + .range(10) + .select(('id + 0.1) cast DecimalType(10, 3) as 'dec) + df.write.option("path", dir.getCanonicalPath).mode("overwrite").saveAsTable("t") + checkAnswer(hiveContext.table("t"), df) + } + + { + val df = + hiveContext + .range(10) + .select(callUDF("struct", ('id + 0.2) cast DecimalType(10, 3)) as 'dec_struct) + df.write.option("path", dir.getCanonicalPath).mode("overwrite").saveAsTable("t") + checkAnswer(hiveContext.table("t"), df) + } + } finally { + dir.delete() + hiveContext.sql("DROP TABLE t") + sparkContext.stop() + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 508695919e9a..d33e81227db8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.io.File +import org.apache.hadoop.hive.conf.HiveConf import org.scalatest.BeforeAndAfter import org.apache.spark.sql.execution.QueryExecutionException @@ -113,6 +114,8 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("SPARK-4203:random partition directory order") { sql("CREATE TABLE tmp_table (key int, value string)") val tmpDir = Utils.createTempDir() + val stagingDir = new HiveConf().getVar(HiveConf.ConfVars.STAGINGDIR) + sql( s""" |CREATE TABLE table_with_partition(c1 string) @@ -145,7 +148,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { """.stripMargin) def listFolders(path: File, acc: List[String]): List[List[String]] = { val dir = path.listFiles() - val folders = dir.filter(_.isDirectory).toList + val folders = dir.filter { e => e.isDirectory && !e.getName().startsWith(stagingDir) }.toList if (folders.isEmpty) { List(acc.reverse) } else { @@ -158,7 +161,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { "p1=a"::"p2=b"::"p3=c"::"p4=c"::"p5=1"::Nil , "p1=a"::"p2=b"::"p3=c"::"p4=c"::"p5=4"::Nil ) - assert(listFolders(tmpDir, List()).sortBy(_.toString()) == expected.sortBy(_.toString)) + assert(listFolders(tmpDir, List()).sortBy(_.toString()) === expected.sortBy(_.toString)) sql("DROP TABLE table_with_partition") sql("DROP TABLE tmp_table") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index 1c15997ea8e6..d3388a9429e4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -34,7 +34,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { override def beforeAll(): Unit = { // The catalog in HiveContext is a case insensitive one. catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan) - catalog.registerTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"), df.logicalPlan) sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)") sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB") sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)") @@ -42,7 +41,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { override def afterAll(): Unit = { catalog.unregisterTable(Seq("ListTablesSuiteTable")) - catalog.unregisterTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable")) sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable") sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable") sql("DROP DATABASE IF EXISTS ListTablesSuiteDB") @@ -55,7 +53,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { checkAnswer( allTables.filter("tableName = 'listtablessuitetable'"), Row("listtablessuitetable", true)) - assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0) checkAnswer( allTables.filter("tableName = 'hivelisttablessuitetable'"), Row("hivelisttablessuitetable", false)) @@ -69,9 +66,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { checkAnswer( allTables.filter("tableName = 'listtablessuitetable'"), Row("listtablessuitetable", true)) - checkAnswer( - allTables.filter("tableName = 'indblisttablessuitetable'"), - Row("indblisttablessuitetable", true)) assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0) checkAnswer( allTables.filter("tableName = 'hiveindblisttablessuitetable'"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 4fdf774ead75..20a50586d520 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -17,12 +17,11 @@ package org.apache.spark.sql.hive -import java.io.File +import java.io.{IOException, File} import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapred.InvalidInputException import org.scalatest.BeforeAndAfterAll import org.apache.spark.Logging @@ -32,7 +31,7 @@ import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -42,7 +41,8 @@ import org.apache.spark.util.Utils */ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll with Logging { - override val sqlContext = TestHive + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext var jsonFilePath: String = _ @@ -463,23 +463,20 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA checkAnswer(sql("SELECT * FROM 
savedJsonTable"), df) - // Right now, we cannot append to an existing JSON table. - intercept[RuntimeException] { - df.write.mode(SaveMode.Append).saveAsTable("savedJsonTable") - } - // We can overwrite it. df.write.mode(SaveMode.Overwrite).saveAsTable("savedJsonTable") checkAnswer(sql("SELECT * FROM savedJsonTable"), df) // When the save mode is Ignore, we will do nothing when the table already exists. df.select("b").write.mode(SaveMode.Ignore).saveAsTable("savedJsonTable") - assert(df.schema === table("savedJsonTable").schema) + // TODO in ResolvedDataSource, will convert the schema into nullable = true + // hence the df.schema is not exactly the same as table("savedJsonTable").schema + // assert(df.schema === table("savedJsonTable").schema) checkAnswer(sql("SELECT * FROM savedJsonTable"), df) // Drop table will also delete the data. sql("DROP TABLE savedJsonTable") - intercept[InvalidInputException] { + intercept[IOException] { read.json(catalog.hiveDefaultTableFilePath("savedJsonTable")) } } @@ -555,7 +552,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA "org.apache.spark.sql.json", schema, Map.empty[String, String]) - }.getMessage.contains("'path' must be specified for json data."), + }.getMessage.contains("key not found: path"), "We should complain that path is not specified.") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index 73852f13ad20..997c667ec0d1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -19,15 +19,22 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{QueryTest, SQLContext, SaveMode} +import org.apache.spark.sql.{AnalysisException, QueryTest, SQLContext, SaveMode} class MultiDatabaseSuite extends QueryTest with SQLTestUtils { - override val sqlContext: SQLContext = TestHive - - import sqlContext.sql + override val _sqlContext: HiveContext = TestHive + private val sqlContext = _sqlContext private val df = sqlContext.range(10).coalesce(1) + private def checkTablePath(dbName: String, tableName: String): Unit = { + // val hiveContext = sqlContext.asInstanceOf[HiveContext] + val metastoreTable = sqlContext.catalog.client.getTable(dbName, tableName) + val expectedPath = sqlContext.catalog.client.getDatabase(dbName).location + "/" + tableName + + assert(metastoreTable.serdeProperties("path") === expectedPath) + } + test(s"saveAsTable() to non-default database - with USE - Overwrite") { withTempDatabase { db => activateDatabase(db) { @@ -38,6 +45,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df) + + checkTablePath(db, "t") } } @@ -46,6 +55,58 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.mode(SaveMode.Overwrite).saveAsTable(s"$db.t") assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df) + + checkTablePath(db, "t") + } + } + + test(s"createExternalTable() to non-default database - with USE") { + withTempDatabase { db => + activateDatabase(db) { + withTempPath { dir => + val path = dir.getCanonicalPath + df.write.format("parquet").mode(SaveMode.Overwrite).save(path) + + sqlContext.createExternalTable("t", path, 
"parquet") + assert(sqlContext.tableNames(db).contains("t")) + checkAnswer(sqlContext.table("t"), df) + + sql( + s""" + |CREATE TABLE t1 + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + assert(sqlContext.tableNames(db).contains("t1")) + checkAnswer(sqlContext.table("t1"), df) + } + } + } + } + + test(s"createExternalTable() to non-default database - without USE") { + withTempDatabase { db => + withTempPath { dir => + val path = dir.getCanonicalPath + df.write.format("parquet").mode(SaveMode.Overwrite).save(path) + sqlContext.createExternalTable(s"$db.t", path, "parquet") + + assert(sqlContext.tableNames(db).contains("t")) + checkAnswer(sqlContext.table(s"$db.t"), df) + + sql( + s""" + |CREATE TABLE $db.t1 + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + assert(sqlContext.tableNames(db).contains("t1")) + checkAnswer(sqlContext.table(s"$db.t1"), df) + } } } @@ -60,6 +121,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df)) + + checkTablePath(db, "t") } } @@ -69,6 +132,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.mode(SaveMode.Append).saveAsTable(s"$db.t") assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df)) + + checkTablePath(db, "t") } } @@ -131,7 +196,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { } } - test("Refreshes a table in a non-default database") { + test("Refreshes a table in a non-default database - with USE") { import org.apache.spark.sql.functions.lit withTempDatabase { db => @@ -152,8 +217,94 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { sql("ALTER TABLE t ADD PARTITION (p=1)") sql("REFRESH TABLE t") checkAnswer(sqlContext.table("t"), df.withColumn("p", lit(1))) + + df.write.parquet(s"$path/p=2") + sql("ALTER TABLE t ADD PARTITION (p=2)") + sqlContext.refreshTable("t") + checkAnswer( + sqlContext.table("t"), + df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) } } } } + + test("Refreshes a table in a non-default database - without USE") { + import org.apache.spark.sql.functions.lit + + withTempDatabase { db => + withTempPath { dir => + val path = dir.getCanonicalPath + + sql( + s"""CREATE EXTERNAL TABLE $db.t (id BIGINT) + |PARTITIONED BY (p INT) + |STORED AS PARQUET + |LOCATION '$path' + """.stripMargin) + + checkAnswer(sqlContext.table(s"$db.t"), sqlContext.emptyDataFrame) + + df.write.parquet(s"$path/p=1") + sql(s"ALTER TABLE $db.t ADD PARTITION (p=1)") + sql(s"REFRESH TABLE $db.t") + checkAnswer(sqlContext.table(s"$db.t"), df.withColumn("p", lit(1))) + + df.write.parquet(s"$path/p=2") + sql(s"ALTER TABLE $db.t ADD PARTITION (p=2)") + sqlContext.refreshTable(s"$db.t") + checkAnswer( + sqlContext.table(s"$db.t"), + df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) + } + } + } + + test("invalid database name and table names") { + { + val message = intercept[AnalysisException] { + df.write.format("parquet").saveAsTable("`d:b`.`t:a`") + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + { + val message = intercept[AnalysisException] { + df.write.format("parquet").saveAsTable("`d:b`.`table`") + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + + { + val message = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE `d:b`.`t:a` (a 
int) + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + { + val message = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE `d:b`.`table` (a int) + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala index bb5f1febe9ad..91d7a48208e8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala @@ -17,76 +17,141 @@ package org.apache.spark.sql.hive +import java.sql.Timestamp +import java.util.{Locale, TimeZone} + +import org.apache.hadoop.hive.conf.HiveConf +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.parquet.ParquetCompatibilityTest import org.apache.spark.sql.{Row, SQLConf, SQLContext} -class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest { - import ParquetCompatibilityTest.makeNullable - - override val sqlContext: SQLContext = TestHive - - override protected def beforeAll(): Unit = { - super.beforeAll() - - withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") { - withTempTable("data") { - sqlContext.sql( - s"""CREATE TABLE parquet_compat( - | bool_column BOOLEAN, - | byte_column TINYINT, - | short_column SMALLINT, - | int_column INT, - | long_column BIGINT, - | float_column FLOAT, - | double_column DOUBLE, - | - | strings_column ARRAY, - | int_to_string_column MAP - |) - |STORED AS PARQUET - |LOCATION '${parquetStore.getCanonicalPath}' - """.stripMargin) - - val schema = sqlContext.table("parquet_compat").schema - val rowRDD = sqlContext.sparkContext.parallelize(makeRows).coalesce(1) - sqlContext.createDataFrame(rowRDD, schema).registerTempTable("data") - sqlContext.sql("INSERT INTO TABLE parquet_compat SELECT * FROM data") - } - } +class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with BeforeAndAfterAll { + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext + + /** + * Set the staging directory (and hence path to ignore Parquet files under) + * to that set by [[HiveConf.ConfVars.STAGINGDIR]]. 
+ */ + private val stagingDir = new HiveConf().getVar(HiveConf.ConfVars.STAGINGDIR) + + private val originalTimeZone = TimeZone.getDefault + private val originalLocale = Locale.getDefault + + protected override def beforeAll(): Unit = { + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + Locale.setDefault(Locale.US) } override protected def afterAll(): Unit = { - sqlContext.sql("DROP TABLE parquet_compat") + TimeZone.setDefault(originalTimeZone) + Locale.setDefault(originalLocale) } - test("Read Parquet file generated by parquet-hive") { + override protected def logParquetSchema(path: String): Unit = { + val schema = readParquetSchema(path, { path => + !path.getName.startsWith("_") && !path.getName.startsWith(stagingDir) + }) + logInfo( - s"""Schema of the Parquet file written by parquet-hive: - |${readParquetSchema(parquetStore.getCanonicalPath)} + s"""Schema of the Parquet file written by parquet-avro: + |$schema """.stripMargin) + } + + private def testParquetHiveCompatibility(row: Row, hiveTypes: String*): Unit = { + withTable("parquet_compat") { + withTempPath { dir => + val path = dir.getCanonicalPath + + // Hive columns are always nullable, so here we append a all-null row. + val rows = row :: Row(Seq.fill(row.length)(null): _*) :: Nil + + // Don't convert Hive metastore Parquet tables to let Hive write those Parquet files. + withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") { + withTempTable("data") { + val fields = hiveTypes.zipWithIndex.map { case (typ, index) => s" col_$index $typ" } + + val ddl = + s"""CREATE TABLE parquet_compat( + |${fields.mkString(",\n")} + |) + |STORED AS PARQUET + |LOCATION '$path' + """.stripMargin + + logInfo( + s"""Creating testing Parquet table with the following DDL: + |$ddl + """.stripMargin) + + sqlContext.sql(ddl) - // Unfortunately parquet-hive doesn't add `UTF8` annotation to BINARY when writing strings. - // Have to assume all BINARY values are strings here. - withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") { - checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), makeRows) + val schema = sqlContext.table("parquet_compat").schema + val rowRDD = sqlContext.sparkContext.parallelize(rows).coalesce(1) + sqlContext.createDataFrame(rowRDD, schema).registerTempTable("data") + sqlContext.sql("INSERT INTO TABLE parquet_compat SELECT * FROM data") + } + } + + logParquetSchema(path) + + // Unfortunately parquet-hive doesn't add `UTF8` annotation to BINARY when writing strings. + // Have to assume all BINARY values are strings here. 
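The logParquetSchema override above only reads the schema from real data files, skipping Parquet summary files (names starting with "_") and whatever in-flight staging directory HiveConf.ConfVars.STAGINGDIR points at. A standalone sketch of that filtering using the plain Hadoop FileSystem API (listDataFiles is an illustrative name, and the staging prefix should be taken from HiveConf rather than hard-coded):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}

object ParquetFileFilterSketch {
  // List data files under a Hive-written table directory, skipping Parquet
  // metadata files ("_metadata", "_common_metadata", "_SUCCESS") and any
  // in-progress Hive staging directories.
  def listDataFiles(dir: String, stagingPrefix: String): Seq[Path] = {
    val tablePath = new Path(dir)
    val fs = tablePath.getFileSystem(new Configuration())
    val filter = new PathFilter {
      override def accept(p: Path): Boolean =
        !p.getName.startsWith("_") && !p.getName.startsWith(stagingPrefix)
    }
    fs.listStatus(tablePath, filter).filter(_.isFile).map(_.getPath).toSeq
  }
}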
+ withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") { + checkAnswer(sqlContext.read.parquet(path), rows) + } + } } } - def makeRows: Seq[Row] = { - (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) + test("simple primitives") { + testParquetHiveCompatibility( + Row(true, 1.toByte, 2.toShort, 3, 4.toLong, 5.1f, 6.1d, "foo"), + "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING") + } + + test("SPARK-10177 timestamp") { + testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP") + } + test("array") { + testParquetHiveCompatibility( Row( - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i.toByte: java.lang.Byte), - nullable((i + 1).toShort: java.lang.Short), - nullable(i + 2: Integer), - nullable(i.toLong * 10: java.lang.Long), - nullable(i.toFloat + 0.1f: java.lang.Float), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(Seq.tabulate(3)(n => s"arr_${i + n}")), - nullable(Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap)) - } + Seq[Integer](1: Integer, null, 2: Integer, null), + Seq[String]("foo", null, "bar", null), + Seq[Seq[Integer]]( + Seq[Integer](1: Integer, null), + Seq[Integer](2: Integer, null))), + "ARRAY", + "ARRAY", + "ARRAY>") + } + + test("map") { + testParquetHiveCompatibility( + Row( + Map[Integer, String]( + (1: Integer) -> "foo", + (2: Integer) -> null)), + "MAP") + } + + // HIVE-11625: Parquet map entries with null keys are dropped by Hive + ignore("map entries with null keys") { + testParquetHiveCompatibility( + Row( + Map[Integer, String]( + null.asInstanceOf[Integer] -> "bar", + null.asInstanceOf[Integer] -> null)), + "MAP") + } + + test("struct") { + testParquetHiveCompatibility( + Row(Row(1, Seq("foo", "bar", null))), + "STRUCT>") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 017bc2adc103..1cc8a93e8341 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -18,50 +18,54 @@ package org.apache.spark.sql.hive import com.google.common.io.Files +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.util.Utils -class QueryPartitionSuite extends QueryTest { +class QueryPartitionSuite extends QueryTest with SQLTestUtils { private lazy val ctx = org.apache.spark.sql.hive.test.TestHive import ctx.implicits._ - import ctx.sql + + protected def _sqlContext = ctx test("SPARK-5068: query data when path doesn't exist"){ - val testData = ctx.sparkContext.parallelize( - (1 to 10).map(i => TestData(i, i.toString))).toDF() - testData.registerTempTable("testData") + withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { + val testData = ctx.sparkContext.parallelize( + (1 to 10).map(i => TestData(i, i.toString))).toDF() + testData.registerTempTable("testData") - val tmpDir = Files.createTempDir() - // create the table for test - sql(s"CREATE TABLE table_with_partition(key int,value string) " + - s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') 
" + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + - "SELECT key,value FROM testData") + val tmpDir = Files.createTempDir() + // create the table for test + sql(s"CREATE TABLE table_with_partition(key int,value string) " + + s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + + "SELECT key,value FROM testData") - // test for the exist path - checkAnswer(sql("select key,value from table_with_partition"), - testData.toDF.collect ++ testData.toDF.collect - ++ testData.toDF.collect ++ testData.toDF.collect) + // test for the exist path + checkAnswer(sql("select key,value from table_with_partition"), + testData.toDF.collect ++ testData.toDF.collect + ++ testData.toDF.collect ++ testData.toDF.collect) - // delete the path of one partition - tmpDir.listFiles - .find { f => f.isDirectory && f.getName().startsWith("ds=") } - .foreach { f => Utils.deleteRecursively(f) } + // delete the path of one partition + tmpDir.listFiles + .find { f => f.isDirectory && f.getName().startsWith("ds=") } + .foreach { f => Utils.deleteRecursively(f) } - // test for after delete the path - checkAnswer(sql("select key,value from table_with_partition"), - testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) + // test for after delete the path + checkAnswer(sql("select key,value from table_with_partition"), + testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) - sql("DROP TABLE table_with_partition") - sql("DROP TABLE createAndInsertTest") + sql("DROP TABLE table_with_partition") + sql("DROP TABLE createAndInsertTest") + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index bc72b0172a46..e4fec7e2c8a2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -54,6 +54,9 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { } } + // Ensure session state is initialized. 
+ ctx.parseSql("use default") + assertAnalyzeCommand( "ANALYZE TABLE Table1 COMPUTE STATISTICS", classOf[HiveNativeCommand]) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala index 9b3ede43ee2d..7ee1c8d13aa3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala @@ -17,14 +17,12 @@ package org.apache.spark.sql.hive -import org.apache.spark.sql.{Row, QueryTest} +import org.apache.spark.sql.QueryTest case class FunctionResult(f1: String, f2: String) class UDFSuite extends QueryTest { - private lazy val ctx = org.apache.spark.sql.hive.test.TestHive - import ctx.implicits._ test("UDF case insensitive") { ctx.udf.register("random0", () => { Math.random() }) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 3eb127e23d48..f0bb77092c0c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.client import java.io.File +import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo} import org.apache.spark.sql.catalyst.util.quietly @@ -48,7 +49,9 @@ class VersionsSuite extends SparkFunSuite with Logging { } test("success sanity check") { - val badClient = IsolatedClientLoader.forVersion("13", buildConf(), ivyPath).client + val badClient = IsolatedClientLoader.forVersion(HiveContext.hiveExecutionVersion, + buildConf(), + ivyPath).client val db = new HiveDatabase("default", "") badClient.createDatabase(db) } @@ -91,6 +94,7 @@ class VersionsSuite extends SparkFunSuite with Logging { versions.foreach { version => test(s"$version: create client") { client = null + System.gc() // Hack to avoid SEGV on some JVM versions. 
client = IsolatedClientLoader.forVersion(version, buildConf(), ivyPath).client } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 6f0db27775e4..4886a8594836 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -17,17 +17,18 @@ package org.apache.spark.sql.hive.execution +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql._ import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} -import org.apache.spark.sql.{SQLConf, AnalysisException, QueryTest, Row} -import org.scalatest.BeforeAndAfterAll -import test.org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} +import org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll { - - override val sqlContext = TestHive + override def _sqlContext: SQLContext = TestHive + protected val sqlContext = _sqlContext import sqlContext.implicits._ var originalUseAggregate2: Boolean = _ @@ -73,8 +74,8 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be emptyDF.registerTempTable("emptyTable") // Register UDAFs - sqlContext.udaf.register("mydoublesum", new MyDoubleSum) - sqlContext.udaf.register("mydoubleavg", new MyDoubleAvg) + sqlContext.udf.register("mydoublesum", new MyDoubleSum) + sqlContext.udf.register("mydoubleavg", new MyDoubleAvg) } override def afterAll(): Unit = { @@ -141,6 +142,22 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be Nil) } + test("null literal") { + checkAnswer( + sqlContext.sql( + """ + |SELECT + | AVG(null), + | COUNT(null), + | FIRST(null), + | LAST(null), + | MAX(null), + | MIN(null), + | SUM(null) + """.stripMargin), + Row(null, 0, null, null, null, null, null) :: Nil) + } + test("only do grouping") { checkAnswer( sqlContext.sql( @@ -266,13 +283,6 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be |SELECT avg(value) FROM agg1 """.stripMargin), Row(11.125) :: Nil) - - checkAnswer( - sqlContext.sql( - """ - |SELECT avg(null) - """.stripMargin), - Row(null) :: Nil) } test("udaf") { @@ -364,7 +374,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be | max(distinct value1) |FROM agg2 """.stripMargin), - Row(-60, 70.0, 101.0/9.0, 5.6, 100.0)) + Row(-60, 70.0, 101.0/9.0, 5.6, 100)) checkAnswer( sqlContext.sql( @@ -402,6 +412,23 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be Row(2, 100.0, 3.0, 0.0, 100.0, 1.0/3.0 + 100.0) :: Row(3, null, 3.0, null, null, null) :: Row(null, 110.0, 60.0, 30.0, 110.0, 110.0) :: Nil) + + checkAnswer( + sqlContext.sql( + """ + |SELECT + | count(value1), + | count(*), + | count(1), + | count(DISTINCT value1), + | key + |FROM agg2 + |GROUP BY key + """.stripMargin), + Row(3, 3, 3, 2, 1) :: + Row(3, 4, 4, 2, 2) :: + Row(0, 2, 2, 0, 3) :: + Row(3, 4, 4, 3, null) :: Nil) } test("test count") { @@ -453,6 +480,21 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be Row(0, null, 1, 1, null, 0) :: Nil) } + test("test Last implemented based 
on AggregateExpression1") { + // TODO: Remove this test once we remove AggregateExpression1. + import org.apache.spark.sql.functions._ + val df = Seq((1, 1), (2, 2), (3, 3)).toDF("i", "j").repartition(1) + withSQLConf( + SQLConf.SHUFFLE_PARTITIONS.key -> "1", + SQLConf.USE_SQL_AGGREGATE2.key -> "false") { + + checkAnswer( + df.groupBy("i").agg(last("j")), + df + ) + } + } + test("error handling") { withSQLConf("spark.sql.useAggregate2" -> "false") { val errorMessage = intercept[AnalysisException] { @@ -496,7 +538,8 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be |FROM agg1 |GROUP BY key """.stripMargin).queryExecution.executedPlan.collect { - case agg: aggregate.Aggregate => agg + case agg: aggregate.SortBasedAggregate => agg + case agg: aggregate.TungstenAggregate => agg } val message = "We should fallback to the old aggregation code path if " + @@ -537,3 +580,58 @@ class TungstenAggregationQuerySuite extends AggregationQuerySuite { sqlContext.setConf(SQLConf.UNSAFE_ENABLED.key, originalUnsafeEnabled.toString) } } + +class TungstenAggregationQueryWithControlledFallbackSuite extends AggregationQuerySuite { + + var originalUnsafeEnabled: Boolean = _ + + override def beforeAll(): Unit = { + originalUnsafeEnabled = sqlContext.conf.unsafeEnabled + sqlContext.setConf(SQLConf.UNSAFE_ENABLED.key, "true") + super.beforeAll() + } + + override def afterAll(): Unit = { + super.afterAll() + sqlContext.setConf(SQLConf.UNSAFE_ENABLED.key, originalUnsafeEnabled.toString) + sqlContext.conf.unsetConf("spark.sql.TungstenAggregate.testFallbackStartsAt") + } + + override protected def checkAnswer(actual: DataFrame, expectedAnswer: Seq[Row]): Unit = { + (0 to 2).foreach { fallbackStartsAt => + sqlContext.setConf( + "spark.sql.TungstenAggregate.testFallbackStartsAt", + fallbackStartsAt.toString) + + // Create a new df to make sure its physical operator picks up + // spark.sql.TungstenAggregate.testFallbackStartsAt. + val newActual = DataFrame(sqlContext, actual.logicalPlan) + + QueryTest.checkAnswer(newActual, expectedAnswer) match { + case Some(errorMessage) => + val newErrorMessage = + s""" + |The following aggregation query failed when using TungstenAggregate with + |controlled fallback (it falls back to sort-based aggregation once it has processed + |$fallbackStartsAt input rows). The query is + |${actual.queryExecution} + | + |$errorMessage + """.stripMargin + + fail(newErrorMessage) + case None => + } + } + } + + // Override it to make sure we call the actually overridden checkAnswer. + override protected def checkAnswer(df: DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(df, Seq(expectedAnswer)) + } + + // Override it to make sure we call the actually overridden checkAnswer. 
+ override protected def checkAnswer(df: DataFrame, expectedAnswer: DataFrame): Unit = { + checkAnswer(df, expectedAnswer.collect()) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 638b9c810372..4d45249d9c6b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.execution import java.io._ +import scala.util.control.NonFatal + import org.scalatest.{BeforeAndAfterAll, GivenWhenThen} import org.apache.spark.{Logging, SparkFunSuite} @@ -124,7 +126,7 @@ abstract class HiveComparisonTest protected val cacheDigest = java.security.MessageDigest.getInstance("MD5") protected def getMd5(str: String): String = { val digest = java.security.MessageDigest.getInstance("MD5") - digest.update(str.getBytes("utf-8")) + digest.update(str.replaceAll(System.lineSeparator(), "\n").getBytes("utf-8")) new java.math.BigInteger(1, digest.digest).toString(16) } @@ -386,11 +388,45 @@ abstract class HiveComparisonTest hiveCacheFiles.foreach(_.delete()) } + // If this query is reading other tables that were created during this test run + // also print out the query plans and results for those. + val computedTablesMessages: String = try { + val tablesRead = new TestHive.QueryExecution(query).executedPlan.collect { + case ts: HiveTableScan => ts.relation.tableName + }.toSet + + TestHive.reset() + val executions = queryList.map(new TestHive.QueryExecution(_)) + executions.foreach(_.toRdd) + val tablesGenerated = queryList.zip(executions).flatMap { + case (q, e) => e.executedPlan.collect { + case i: InsertIntoHiveTable if tablesRead contains i.table.tableName => + (q, e, i) + } + } + + tablesGenerated.map { case (hiveql, execution, insert) => + s""" + |=== Generated Table === + |$hiveql + |$execution + |== Results == + |${insert.child.execute().collect().mkString("\n")} + """.stripMargin + }.mkString("\n") + + } catch { + case NonFatal(e) => + logError("Failed to compute generated tables", e) + s"Couldn't compute dependent tables: $e" + } + val errorMessage = s""" |Results do not match for $testCaseName: |$hiveQuery\n${hiveQuery.analyzed.output.map(_.name).mkString("\t")} |$resultComparison + |$computedTablesMessages """.stripMargin stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index 697211222b90..11d7a872dff0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -17,13 +17,18 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{SQLContext, QueryTest} +import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.test.SQLTestUtils /** * A set of tests that validates support for Hive Explain command. 
*/ -class HiveExplainSuite extends QueryTest { +class HiveExplainSuite extends QueryTest with SQLTestUtils { + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext + test("explain extended command") { checkExistence(sql(" explain select * from src where key=123 "), true, "== Physical Plan ==") @@ -36,7 +41,7 @@ class HiveExplainSuite extends QueryTest { "== Analyzed Logical Plan ==", "== Optimized Logical Plan ==", "== Physical Plan ==", - "Code Generation", "== RDD ==") + "Code Generation") } test("explain create table command") { @@ -74,4 +79,30 @@ class HiveExplainSuite extends QueryTest { "Limit", "src") } + + test("SPARK-6212: The EXPLAIN output of CTAS only shows the analyzed plan") { + withTempTable("jt") { + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) + read.json(rdd).registerTempTable("jt") + val outputs = sql( + s""" + |EXPLAIN EXTENDED + |CREATE TABLE t1 + |AS + |SELECT * FROM jt + """.stripMargin).collect().map(_.mkString).mkString + + val shouldContain = + "== Parsed Logical Plan ==" :: "== Analyzed Logical Plan ==" :: "Subquery" :: + "== Optimized Logical Plan ==" :: "== Physical Plan ==" :: + "CreateTableAsSelect" :: "InsertIntoHiveTable" :: "jt" :: Nil + for (key <- shouldContain) { + assert(outputs.contains(key), s"$key doesn't exist in result") + } + + val physicalIndex = outputs.indexOf("== Physical Plan ==") + assert(!outputs.substring(physicalIndex).contains("Subquery"), + "Physical Plan should not contain Subquery since it's eliminated by optimizer") + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 11a843becce6..83f9f3eaa3a5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -52,14 +52,6 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) // Add Locale setting Locale.setDefault(Locale.US) - sql(s"ADD JAR ${TestHive.getHiveFile("TestUDTF.jar").getCanonicalPath()}") - // The function source code can be found at: - // https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF - sql( - """ - |CREATE TEMPORARY FUNCTION udtf_count2 - |AS 'org.apache.spark.sql.hive.execution.GenericUDTFCount2' - """.stripMargin) } override def afterAll() { @@ -69,15 +61,6 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { sql("DROP TEMPORARY FUNCTION udtf_count2") } - createQueryTest("Test UDTF.close in Lateral Views", - """ - |SELECT key, cc - |FROM src LATERAL VIEW udtf_count2(value) dd AS cc - """.stripMargin, false) // false mean we have to keep the temp function in registry - - createQueryTest("Test UDTF.close in SELECT", - "SELECT udtf_count2(a) FROM (SELECT 1 AS a FROM src LIMIT 3) table", false) - test("SPARK-4908: concurrent hive native commands") { (1 to 100).par.map { _ => sql("USE default") @@ -176,8 +159,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { createQueryTest("! 
operator", """ |SELECT a FROM ( - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT 2 AS a FROM src LIMIT 1) table + | SELECT 1 AS a UNION ALL SELECT 2 AS a) t |WHERE !(a>1) """.stripMargin) @@ -229,71 +211,6 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |FROM src LIMIT 1; """.stripMargin) - createQueryTest("count distinct 0 values", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 'a' AS a FROM src LIMIT 0) table - """.stripMargin) - - createQueryTest("count distinct 1 value strings", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 'a' AS a FROM src LIMIT 1 UNION ALL - | SELECT 'b' AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 1 value", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT 1 AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 2 values", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT 2 AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 2 values including null", - """ - |SELECT COUNT(DISTINCT a, 1) FROM ( - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT null AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 1 value + null", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT 1 AS a FROM src LIMIT 1 UNION ALL - | SELECT null AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 1 value long", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1L AS a FROM src LIMIT 1 UNION ALL - | SELECT 1L AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 2 values long", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1L AS a FROM src LIMIT 1 UNION ALL - | SELECT 2L AS a FROM src LIMIT 1) table - """.stripMargin) - - createQueryTest("count distinct 1 value + null long", - """ - |SELECT COUNT(DISTINCT a) FROM ( - | SELECT 1L AS a FROM src LIMIT 1 UNION ALL - | SELECT 1L AS a FROM src LIMIT 1 UNION ALL - | SELECT null AS a FROM src LIMIT 1) table - """.stripMargin) - createQueryTest("null case", "SELECT case when(true) then 1 else null end FROM src LIMIT 1") @@ -510,7 +427,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |USING 'cat' AS (tKey, tValue) ROW FORMAT SERDE |'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' FROM src; - """.stripMargin.replaceAll("\n", " ")) + """.stripMargin.replaceAll(System.lineSeparator(), " ")) test("transform with SerDe2") { @@ -529,7 +446,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |('avro.schema.literal'='{"namespace": "testing.hive.avro.serde","name": |"src","type": "record","fields": [{"name":"key","type":"int"}]}') |FROM small_src - """.stripMargin.replaceAll("\n", " ")).collect().head + """.stripMargin.replaceAll(System.lineSeparator(), " ")).collect().head assert(expected(0) === res(0)) } @@ -541,7 +458,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |('serialization.last.column.takes.rest'='true') USING 'cat' AS (tKey, tValue) |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |WITH SERDEPROPERTIES ('serialization.last.column.takes.rest'='true') FROM src; - """.stripMargin.replaceAll("\n", " ")) + """.stripMargin.replaceAll(System.lineSeparator(), " ")) 
createQueryTest("transform with SerDe4", """ @@ -550,7 +467,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |('serialization.last.column.takes.rest'='true') USING 'cat' ROW FORMAT SERDE |'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES |('serialization.last.column.takes.rest'='true') FROM src; - """.stripMargin.replaceAll("\n", " ")) + """.stripMargin.replaceAll(System.lineSeparator(), " ")) createQueryTest("LIKE", "SELECT * FROM src WHERE value LIKE '%1%'") @@ -670,11 +587,62 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |select * where key = 4 """.stripMargin) + // test get_json_object again Hive, because the HiveCompatabilitySuite cannot handle result + // with newline in it. + createQueryTest("get_json_object #1", + "SELECT get_json_object(src_json.json, '$') FROM src_json") + + createQueryTest("get_json_object #2", + "SELECT get_json_object(src_json.json, '$.owner'), get_json_object(src_json.json, '$.store')" + + " FROM src_json") + + createQueryTest("get_json_object #3", + "SELECT get_json_object(src_json.json, '$.store.bicycle'), " + + "get_json_object(src_json.json, '$.store.book') FROM src_json") + + createQueryTest("get_json_object #4", + "SELECT get_json_object(src_json.json, '$.store.book[0]'), " + + "get_json_object(src_json.json, '$.store.book[*]') FROM src_json") + + createQueryTest("get_json_object #5", + "SELECT get_json_object(src_json.json, '$.store.book[0].category'), " + + "get_json_object(src_json.json, '$.store.book[*].category'), " + + "get_json_object(src_json.json, '$.store.book[*].isbn'), " + + "get_json_object(src_json.json, '$.store.book[*].reader') FROM src_json") + + createQueryTest("get_json_object #6", + "SELECT get_json_object(src_json.json, '$.store.book[*].reader[0].age'), " + + "get_json_object(src_json.json, '$.store.book[*].reader[*].age') FROM src_json") + + createQueryTest("get_json_object #7", + "SELECT get_json_object(src_json.json, '$.store.basket[0][1]'), " + + "get_json_object(src_json.json, '$.store.basket[*]'), " + + // Hive returns wrong result with [*][0], so this expression is change to make test pass + "get_json_object(src_json.json, '$.store.basket[0][0]'), " + + "get_json_object(src_json.json, '$.store.basket[0][*]'), " + + "get_json_object(src_json.json, '$.store.basket[*][*]'), " + + "get_json_object(src_json.json, '$.store.basket[0][2].b'), " + + "get_json_object(src_json.json, '$.store.basket[0][*].b') FROM src_json") + + createQueryTest("get_json_object #8", + "SELECT get_json_object(src_json.json, '$.non_exist_key'), " + + "get_json_object(src_json.json, '$..no_recursive'), " + + "get_json_object(src_json.json, '$.store.book[10]'), " + + "get_json_object(src_json.json, '$.store.book[0].non_exist_key'), " + + "get_json_object(src_json.json, '$.store.basket[*].non_exist_key'), " + + "get_json_object(src_json.json, '$.store.basket[0][*].non_exist_key') FROM src_json") + + createQueryTest("get_json_object #9", + "SELECT get_json_object(src_json.json, '$.zip code') FROM src_json") + + createQueryTest("get_json_object #10", + "SELECT get_json_object(src_json.json, '$.fb:testid') FROM src_json") + test("predicates contains an empty AttributeSet() references") { sql( """ |SELECT a FROM ( - | SELECT 1 AS a FROM src LIMIT 1 ) table + | SELECT 1 AS a FROM src LIMIT 1 ) t |WHERE abs(20141202) is not null """.stripMargin).collect() } @@ -987,7 +955,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { .zip(parts) .map { case (k, v) => if (v == 
"NULL") { - s"$k=${ConfVars.DEFAULTPARTITIONNAME.defaultVal}" + s"$k=${ConfVars.DEFAULTPARTITIONNAME.defaultStrVal}" } else { s"$k=$v" } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 7069afc9f7da..b03a35132325 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -182,7 +182,7 @@ class HiveUDFSuite extends QueryTest { val errMsg = intercept[AnalysisException] { sql("SELECT testUDFToListString(s) FROM inputTable") } - assert(errMsg.getMessage === "List type in java is unsupported because " + + assert(errMsg.getMessage contains "List type in java is unsupported because " + "JVM type erasure makes spark fail to catch a component type in List<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListString") @@ -197,7 +197,7 @@ class HiveUDFSuite extends QueryTest { val errMsg = intercept[AnalysisException] { sql("SELECT testUDFToListInt(s) FROM inputTable") } - assert(errMsg.getMessage === "List type in java is unsupported because " + + assert(errMsg.getMessage contains "List type in java is unsupported because " + "JVM type erasure makes spark fail to catch a component type in List<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListInt") @@ -213,7 +213,7 @@ class HiveUDFSuite extends QueryTest { val errMsg = intercept[AnalysisException] { sql("SELECT testUDFToStringIntMap(s) FROM inputTable") } - assert(errMsg.getMessage === "Map type in java is unsupported because " + + assert(errMsg.getMessage contains "Map type in java is unsupported because " + "JVM type erasure makes spark fail to catch key and value types in Map<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToStringIntMap") @@ -229,7 +229,7 @@ class HiveUDFSuite extends QueryTest { val errMsg = intercept[AnalysisException] { sql("SELECT testUDFToIntIntMap(s) FROM inputTable") } - assert(errMsg.getMessage === "Map type in java is unsupported because " + + assert(errMsg.getMessage contains "Map type in java is unsupported because " + "JVM type erasure makes spark fail to catch key and value types in Map<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToIntIntMap") @@ -276,6 +276,11 @@ class HiveUDFSuite extends QueryTest { checkAnswer( sql("SELECT testStringStringUDF(\"hello\", s) FROM stringTable"), Seq(Row("hello world"), Row("hello goodbye"))) + + checkAnswer( + sql("SELECT testStringStringUDF(\"\", testStringStringUDF(\"hello\", s)) FROM stringTable"), + Seq(Row(" hello world"), Row(" hello goodbye"))) + sql("DROP TEMPORARY FUNCTION IF EXISTS testStringStringUDF") TestHive.reset() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala index e83a7dc77e32..3bf8f3ac2048 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala @@ -82,16 +82,16 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter { Seq.empty) createPruningTest("Column pruning - non-trivial top project with aliases", - "SELECT c1 * 2 AS double FROM (SELECT key AS c1 FROM src WHERE key > 10) t1 LIMIT 3", - Seq("double"), + "SELECT c1 * 2 AS dbl FROM (SELECT key AS c1 FROM src WHERE key > 10) t1 LIMIT 3", + Seq("dbl"), Seq("key"), Seq.empty) // Partition 
pruning tests createPruningTest("Partition pruning - non-partitioned, non-trivial project", - "SELECT key * 2 AS double FROM src WHERE value IS NOT NULL", - Seq("double"), + "SELECT key * 2 AS dbl FROM src WHERE value IS NOT NULL", + Seq("dbl"), Seq("key", "value"), Seq.empty) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index c4923d83e48f..55ecbd5b5f21 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -30,9 +30,10 @@ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation} -import org.apache.spark.sql.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) @@ -65,7 +66,27 @@ class MyDialect extends DefaultParserDialect * valid, but Hive currently cannot execute it. */ class SQLQuerySuite extends QueryTest with SQLTestUtils { - override def sqlContext: SQLContext = TestHive + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext + + test("UDTF") { + sql(s"ADD JAR ${TestHive.getHiveFile("TestUDTF.jar").getCanonicalPath()}") + // The function source code can be found at: + // https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF + sql( + """ + |CREATE TEMPORARY FUNCTION udtf_count2 + |AS 'org.apache.spark.sql.hive.execution.GenericUDTFCount2' + """.stripMargin) + + checkAnswer( + sql("SELECT key, cc FROM src LATERAL VIEW udtf_count2(value) dd AS cc"), + Row(97, 500) :: Row(97, 500) :: Nil) + + checkAnswer( + sql("SELECT udtf_count2(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), + Row(3) :: Row(3) :: Nil) + } test("SPARK-6835: udtf in lateral view") { val df = Seq((1, 1)).toDF("c1", "c2") @@ -264,47 +285,51 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { setConf(HiveContext.CONVERT_CTAS, true) - sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - var message = intercept[AnalysisException] { + try { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - }.getMessage - assert(message.contains("ctas1 already exists")) - checkRelation("ctas1", true) - sql("DROP TABLE ctas1") - - // Specifying database name for query can be converted to data source write path - // is not allowed right now. 
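The CTAS block that follows in SQLQuerySuite is wrapped in try/finally so the convertCTAS flag is restored and the table dropped even when an assertion throws. The table-cleanup half of that idea, written as a reusable helper in the spirit of SQLTestUtils.withTable (a sketch, not the actual utility):

import org.apache.spark.sql.SQLContext

object WithTableSketch {
  // Run `body`, then drop the named tables whether or not it succeeded.
  def withTable(ctx: SQLContext)(tableNames: String*)(body: => Unit): Unit = {
    try body finally {
      tableNames.foreach(name => ctx.sql(s"DROP TABLE IF EXISTS $name"))
    }
  }
}

Used as withTable(ctx)("ctas1") { ... } so a failed CREATE TABLE assertion cannot leave a stale table behind for the next test.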
- message = intercept[AnalysisException] { - sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - }.getMessage - assert( - message.contains("Cannot specify database name in a CTAS statement"), - "When spark.sql.hive.convertCTAS is true, we should not allow " + - "database name specified.") - - sql("CREATE TABLE ctas1 stored as textfile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true) - sql("DROP TABLE ctas1") - - sql( - "CREATE TABLE ctas1 stored as sequencefile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true) - sql("DROP TABLE ctas1") - - sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false) - sql("DROP TABLE ctas1") - - sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false) - sql("DROP TABLE ctas1") - - sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false) - sql("DROP TABLE ctas1") - - setConf(HiveContext.CONVERT_CTAS, originalConf) + sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + var message = intercept[AnalysisException] { + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert(message.contains("ctas1 already exists")) + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + // Specifying database name for query can be converted to data source write path + // is not allowed right now. + message = intercept[AnalysisException] { + sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert( + message.contains("Cannot specify database name in a CTAS statement"), + "When spark.sql.hive.convertCTAS is true, we should not allow " + + "database name specified.") + + sql("CREATE TABLE ctas1 stored as textfile" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as sequencefile" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + } finally { + setConf(HiveContext.CONVERT_CTAS, originalConf) + sql("DROP TABLE IF EXISTS ctas1") + } } test("SQL Dialect Switching") { @@ -637,7 +662,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { test("resolve udtf in projection #2") { val rdd = sparkContext.makeRDD((1 to 2).map(i => s"""{"a":[$i, ${i + 1}]}""")) - jsonRDD(rdd).registerTempTable("data") + read.json(rdd).registerTempTable("data") checkAnswer(sql("SELECT explode(map(1, 1)) FROM data LIMIT 1"), Row(1, 1) :: Nil) checkAnswer(sql("SELECT explode(map(1, 1)) as (k1, k2) FROM data LIMIT 1"), Row(1, 1) :: Nil) intercept[AnalysisException] { @@ -652,7 +677,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { // TGF with non-TGF in project is allowed in Spark SQL, but not in Hive test("TGF with non-TGF in projection") { val 
rdd = sparkContext.makeRDD( """{"a": "1", "b":"1"}""" :: Nil) - jsonRDD(rdd).registerTempTable("data") + read.json(rdd).registerTempTable("data") checkAnswer( sql("SELECT explode(map(a, b)) as (k1, k2), a, b FROM data"), Row("1", "1", "1", "1") :: Nil) @@ -670,22 +695,25 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { val originalConf = convertCTAS setConf(HiveContext.CONVERT_CTAS, false) - sql("CREATE TABLE explodeTest (key bigInt)") - table("explodeTest").queryExecution.analyzed match { - case metastoreRelation: MetastoreRelation => // OK - case _ => - fail("To correctly test the fix of SPARK-5875, explodeTest should be a MetastoreRelation") - } + try { + sql("CREATE TABLE explodeTest (key bigInt)") + table("explodeTest").queryExecution.analyzed match { + case metastoreRelation: MetastoreRelation => // OK + case _ => + fail("To correctly test the fix of SPARK-5875, explodeTest should be a MetastoreRelation") + } - sql(s"INSERT OVERWRITE TABLE explodeTest SELECT explode(a) AS val FROM data") - checkAnswer( - sql("SELECT key from explodeTest"), - (1 to 5).flatMap(i => Row(i) :: Row(i + 1) :: Nil) - ) + sql(s"INSERT OVERWRITE TABLE explodeTest SELECT explode(a) AS val FROM data") + checkAnswer( + sql("SELECT key from explodeTest"), + (1 to 5).flatMap(i => Row(i) :: Row(i + 1) :: Nil) + ) - sql("DROP TABLE explodeTest") - dropTempTable("data") - setConf(HiveContext.CONVERT_CTAS, originalConf) + sql("DROP TABLE explodeTest") + dropTempTable("data") + } finally { + setConf(HiveContext.CONVERT_CTAS, originalConf) + } } test("sanity test for SPARK-6618") { @@ -725,6 +753,16 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { .queryExecution.toRdd.count()) } + test("test script transform data type") { + val data = (1 to 5).map { i => (i, i) } + data.toDF("key", "value").registerTempTable("test") + checkAnswer( + sql("""FROM + |(FROM test SELECT TRANSFORM(key, value) USING 'cat' AS (thing1 int, thing2 string)) t + |SELECT thing1 + 1 + """.stripMargin), (2 to 6).map(i => Row(i))) + } + test("window function: udaf with aggregate expressin") { val data = Seq( WindowData(1, "a", 5), @@ -912,6 +950,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { } test("SPARK-7595: Window will cause resolve failed with self join") { + sql("SELECT * FROM src") // Force loading of src table. 
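// Illustrative only, mirroring the jsonRDD -> read.json migration in the hunks above:
// the DataFrameReader builds the same temp table from an RDD of JSON strings.
val jsonStrings = sparkContext.makeRDD("""{"a": [1, 2]}""" :: Nil)
read.json(jsonStrings).registerTempTable("data")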
+ checkAnswer(sql( """ |with @@ -1058,12 +1098,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { test("SPARK-8588 HiveTypeCoercion.inConversion fires too early") { val df = TestHive.createDataFrame(Seq((1, "2014-01-01"), (2, "2015-01-01"), (3, "2016-01-01"))) - df.toDF("id", "date").registerTempTable("test_SPARK8588") + df.toDF("id", "datef").registerTempTable("test_SPARK8588") checkAnswer( TestHive.sql( """ - |select id, concat(year(date)) - |from test_SPARK8588 where concat(year(date), ' year') in ('2015 year', '2014 year') + |select id, concat(year(datef)) + |from test_SPARK8588 where concat(year(datef), ' year') in ('2015 year', '2014 year') """.stripMargin), Row(1, "2014") :: Row(2, "2015") :: Nil ) @@ -1077,4 +1117,60 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) } + + test("Convert hive interval term into Literal of CalendarIntervalType") { + checkAnswer(sql("select interval '10-9' year to month"), + Row(CalendarInterval.fromString("interval 10 years 9 months"))) + checkAnswer(sql("select interval '20 15:40:32.99899999' day to second"), + Row(CalendarInterval.fromString("interval 2 weeks 6 days 15 hours 40 minutes " + + "32 seconds 99 milliseconds 899 microseconds"))) + checkAnswer(sql("select interval '30' year"), + Row(CalendarInterval.fromString("interval 30 years"))) + checkAnswer(sql("select interval '25' month"), + Row(CalendarInterval.fromString("interval 25 months"))) + checkAnswer(sql("select interval '-100' day"), + Row(CalendarInterval.fromString("interval -14 weeks -2 days"))) + checkAnswer(sql("select interval '40' hour"), + Row(CalendarInterval.fromString("interval 1 days 16 hours"))) + checkAnswer(sql("select interval '80' minute"), + Row(CalendarInterval.fromString("interval 1 hour 20 minutes"))) + checkAnswer(sql("select interval '299.889987299' second"), + Row(CalendarInterval.fromString( + "interval 4 minutes 59 seconds 889 milliseconds 987 microseconds"))) + } + + test("specifying database name for a temporary table is not allowed") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = + sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + df + .write + .format("parquet") + .save(path) + + val message = intercept[AnalysisException] { + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE db.t + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("Specifying database name or other qualifiers are not allowed")) + + // If you use backticks to quote the name of a temporary table having dot in it. 
+ sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE `db.t` + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + checkAnswer(sqlContext.table("`db.t`"), df) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index 0875232aede3..9aca40f15ac1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -31,7 +31,8 @@ import org.apache.spark.sql.types.StringType class ScriptTransformationSuite extends SparkPlanTest { - override def sqlContext: SQLContext = TestHive + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext private val noSerdeIOSchema = HiveScriptIOSchema( inputRowFormat = Seq.empty, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala index af3f468aaa5e..593e68949ef4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala @@ -29,6 +29,14 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { import sqlContext._ import sqlContext.implicits._ + // ORC does not play well with NullType and UDT. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: CalendarIntervalType => false + case _: UserDefinedType[_] => false + case _ => true + } + test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) @@ -48,11 +56,9 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( - load( - source = dataSourceName, - options = Map( - "path" -> file.getCanonicalPath, - "dataSchema" -> dataSchemaWithPartition.json))) + read.options(Map( + "path" -> file.getCanonicalPath, + "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load()) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala index d463e8fd626f..a46ca9a2c970 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala @@ -31,7 +31,6 @@ import org.scalatest.BeforeAndAfterAll import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag - // The data where the partitioning key exists only in the directory structure. 
case class OrcParData(intField: Int, stringField: String) @@ -40,7 +39,7 @@ case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: St // TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll { - val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultVal + val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultStrVal def withTempDir(f: File => Unit): Unit = { val dir = Utils.createTempDir().getCanonicalFile diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 82e08caf4645..80c38084f293 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll { sql("SELECT * FROM normal_orc_as_source"), (6 to 10).map(i => Row(i, s"part-$i"))) } + + test("write null values") { + sql("DROP TABLE IF EXISTS orcNullValues") + + val df = sql( + """ + |SELECT + | CAST(null as TINYINT), + | CAST(null as SMALLINT), + | CAST(null as INT), + | CAST(null as BIGINT), + | CAST(null as FLOAT), + | CAST(null as DOUBLE), + | CAST(null as DECIMAL(7,2)), + | CAST(null as TIMESTAMP), + | CAST(null as DATE), + | CAST(null as STRING), + | CAST(null as VARCHAR(10)) + |FROM orc_temp_table limit 1 + """.stripMargin) + + df.write.format("orc").saveAsTable("orcNullValues") + + checkAnswer( + sql("SELECT * FROM orcNullValues"), + Row.fromSeq(Seq.fill(11)(null))) + + sql("DROP TABLE IF EXISTS orcNullValues") + } } class OrcSourceSuite extends OrcSuite { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala index 145965388da0..f7ba20ff41d8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql._ import org.apache.spark.sql.test.SQLTestUtils private[sql] trait OrcTest extends SQLTestUtils { this: SparkFunSuite => - lazy val sqlContext = org.apache.spark.sql.hive.test.TestHive - + protected override def _sqlContext: SQLContext = org.apache.spark.sql.hive.test.TestHive + protected val sqlContext = _sqlContext import sqlContext.implicits._ import sqlContext.sparkContext diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index f56fb96c52d3..34d3434569f5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.parquet.ParquetRelation +import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -60,7 +60,14 @@ case class ParquetDataWithKeyAndComplexTypes( class ParquetMetastoreSuite extends ParquetPartitioningTest { override def beforeAll(): Unit = { super.beforeAll() - + 
dropTables("partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes", + "normal_parquet", + "jt", + "jt_array", + "test_parquet") sql(s""" create external table partitioned_parquet ( @@ -172,14 +179,14 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } override def afterAll(): Unit = { - sql("DROP TABLE partitioned_parquet") - sql("DROP TABLE partitioned_parquet_with_key") - sql("DROP TABLE partitioned_parquet_with_complextypes") - sql("DROP TABLE partitioned_parquet_with_key_and_complextypes") - sql("DROP TABLE normal_parquet") - sql("DROP TABLE IF EXISTS jt") - sql("DROP TABLE IF EXISTS jt_array") - sql("DROP TABLE IF EXISTS test_parquet") + dropTables("partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes", + "normal_parquet", + "jt", + "jt_array", + "test_parquet") setConf(HiveContext.CONVERT_METASTORE_PARQUET, false) } @@ -203,6 +210,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } test("insert into an empty parquet table") { + dropTables("test_insert_parquet") sql( """ |create table test_insert_parquet @@ -228,7 +236,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField > 2"), Row(3, "str3") :: Row(4, "str4") :: Nil ) - sql("DROP TABLE IF EXISTS test_insert_parquet") + dropTables("test_insert_parquet") // Create it again. sql( @@ -255,118 +263,118 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql(s"SELECT intField, stringField FROM test_insert_parquet"), (1 to 10).map(i => Row(i, s"str$i")) ++ (1 to 4).map(i => Row(i, s"str$i")) ) - sql("DROP TABLE IF EXISTS test_insert_parquet") + dropTables("test_insert_parquet") } test("scan a parquet table created through a CTAS statement") { - sql( - """ - |create table test_parquet_ctas ROW FORMAT - |SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - |STORED AS - | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - |AS select * from jt - """.stripMargin) + withTable("test_parquet_ctas") { + sql( + """ + |create table test_parquet_ctas ROW FORMAT + |SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |AS select * from jt + """.stripMargin) - checkAnswer( - sql(s"SELECT a, b FROM test_parquet_ctas WHERE a = 1"), - Seq(Row(1, "str1")) - ) + checkAnswer( + sql(s"SELECT a, b FROM test_parquet_ctas WHERE a = 1"), + Seq(Row(1, "str1")) + ) - table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(_: ParquetRelation) => // OK - case _ => fail( - "test_parquet_ctas should be converted to " + - s"${classOf[ParquetRelation].getCanonicalName}") + table("test_parquet_ctas").queryExecution.optimizedPlan match { + case LogicalRelation(_: ParquetRelation) => // OK + case _ => fail( + "test_parquet_ctas should be converted to " + + s"${classOf[ParquetRelation].getCanonicalName }") + } } - - sql("DROP TABLE IF EXISTS test_parquet_ctas") } test("MetastoreRelation in InsertIntoTable will be converted") { - sql( - """ - |create table test_insert_parquet - |( - | intField INT - |) - |ROW FORMAT 
SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - |STORED AS - | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - """.stripMargin) + withTable("test_insert_parquet") { + sql( + """ + |create table test_insert_parquet + |( + | intField INT + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt") + df.queryExecution.executedPlan match { + case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation, _, _)) => // OK + case o => fail("test_insert_parquet should be converted to a " + + s"${classOf[ParquetRelation].getCanonicalName} and " + + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan. " + + s"However, found a ${o.toString} ") + } - val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt") - df.queryExecution.executedPlan match { - case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation, _, _)) => // OK - case o => fail("test_insert_parquet should be converted to a " + - s"${classOf[ParquetRelation].getCanonicalName} and " + - s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan. " + - s"However, found a ${o.toString} ") + checkAnswer( + sql("SELECT intField FROM test_insert_parquet WHERE test_insert_parquet.intField > 5"), + sql("SELECT a FROM jt WHERE jt.a > 5").collect() + ) } - - checkAnswer( - sql("SELECT intField FROM test_insert_parquet WHERE test_insert_parquet.intField > 5"), - sql("SELECT a FROM jt WHERE jt.a > 5").collect() - ) - - sql("DROP TABLE IF EXISTS test_insert_parquet") } test("MetastoreRelation in InsertIntoHiveTable will be converted") { - sql( - """ - |create table test_insert_parquet - |( - | int_array array - |) - |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - |STORED AS - | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - """.stripMargin) + withTable("test_insert_parquet") { + sql( + """ + |create table test_insert_parquet + |( + | int_array array + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array") + df.queryExecution.executedPlan match { + case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation, _, _)) => // OK + case o => fail("test_insert_parquet should be converted to a " + + s"${classOf[ParquetRelation].getCanonicalName} and " + + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." 
+ + s"However, found a ${o.toString} ") + } - val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array") - df.queryExecution.executedPlan match { - case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation, _, _)) => // OK - case o => fail("test_insert_parquet should be converted to a " + - s"${classOf[ParquetRelation].getCanonicalName} and " + - s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." + - s"However, found a ${o.toString} ") + checkAnswer( + sql("SELECT int_array FROM test_insert_parquet"), + sql("SELECT a FROM jt_array").collect() + ) } - - checkAnswer( - sql("SELECT int_array FROM test_insert_parquet"), - sql("SELECT a FROM jt_array").collect() - ) - - sql("DROP TABLE IF EXISTS test_insert_parquet") } test("SPARK-6450 regression test") { - sql( - """CREATE TABLE IF NOT EXISTS ms_convert (key INT) - |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - |STORED AS - | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - """.stripMargin) + withTable("ms_convert") { + sql( + """CREATE TABLE IF NOT EXISTS ms_convert (key INT) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + // This shouldn't throw AnalysisException + val analyzed = sql( + """SELECT key FROM ms_convert + |UNION ALL + |SELECT key FROM ms_convert + """.stripMargin).queryExecution.analyzed - // This shouldn't throw AnalysisException - val analyzed = sql( - """SELECT key FROM ms_convert - |UNION ALL - |SELECT key FROM ms_convert - """.stripMargin).queryExecution.analyzed - - assertResult(2) { - analyzed.collect { - case r @ LogicalRelation(_: ParquetRelation) => r - }.size + assertResult(2) { + analyzed.collect { + case r@LogicalRelation(_: ParquetRelation) => r + }.size + } } - - sql("DROP TABLE ms_convert") } def collectParquetRelation(df: DataFrame): ParquetRelation = { @@ -379,42 +387,42 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } test("SPARK-7749: non-partitioned metastore Parquet table lookup should use cached relation") { - sql( - s"""CREATE TABLE nonPartitioned ( - | key INT, - | value STRING - |) - |STORED AS PARQUET - """.stripMargin) - - // First lookup fills the cache - val r1 = collectParquetRelation(table("nonPartitioned")) - // Second lookup should reuse the cache - val r2 = collectParquetRelation(table("nonPartitioned")) - // They should be the same instance - assert(r1 eq r2) - - sql("DROP TABLE nonPartitioned") + withTable("nonPartitioned") { + sql( + s"""CREATE TABLE nonPartitioned ( + | key INT, + | value STRING + |) + |STORED AS PARQUET + """.stripMargin) + + // First lookup fills the cache + val r1 = collectParquetRelation(table("nonPartitioned")) + // Second lookup should reuse the cache + val r2 = collectParquetRelation(table("nonPartitioned")) + // They should be the same instance + assert(r1 eq r2) + } } test("SPARK-7749: partitioned metastore Parquet table lookup should use cached relation") { - sql( - s"""CREATE TABLE partitioned ( - | key INT, - | value STRING - |) - |PARTITIONED BY (part INT) - |STORED AS PARQUET + withTable("partitioned") { + sql( + s"""CREATE TABLE partitioned ( + | key INT, + | value STRING + |) + |PARTITIONED BY (part INT) + 
|STORED AS PARQUET """.stripMargin) - // First lookup fills the cache - val r1 = collectParquetRelation(table("partitioned")) - // Second lookup should reuse the cache - val r2 = collectParquetRelation(table("partitioned")) - // They should be the same instance - assert(r1 eq r2) - - sql("DROP TABLE partitioned") + // First lookup fills the cache + val r1 = collectParquetRelation(table("partitioned")) + // Second lookup should reuse the cache + val r2 = collectParquetRelation(table("partitioned")) + // They should be the same instance + assert(r1 eq r2) + } } test("Caching converted data source Parquet Relations") { @@ -430,8 +438,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } } - sql("DROP TABLE IF EXISTS test_insert_parquet") - sql("DROP TABLE IF EXISTS test_parquet_partitioned_cache_test") + dropTables("test_insert_parquet", "test_parquet_partitioned_cache_test") sql( """ @@ -479,7 +486,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { | intField INT, | stringField STRING |) - |PARTITIONED BY (date string) + |PARTITIONED BY (`date` string) |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' |STORED AS | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' @@ -491,7 +498,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql( """ |INSERT INTO TABLE test_parquet_partitioned_cache_test - |PARTITION (date='2015-04-01') + |PARTITION (`date`='2015-04-01') |select a, b from jt """.stripMargin) // Right now, insert into a partitioned Parquet is not supported in data source Parquet. @@ -500,7 +507,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql( """ |INSERT INTO TABLE test_parquet_partitioned_cache_test - |PARTITION (date='2015-04-02') + |PARTITION (`date`='2015-04-02') |select a, b from jt """.stripMargin) assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null) @@ -510,7 +517,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { checkCached(tableIdentifier) // Make sure we can read the data. 
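// Illustrative query (not in the patch) against the table defined above: presumably
// because `date` collides with a type keyword in the upgraded Hive parser, every
// reference to the partition column is backtick-quoted, in DDL, INSERT ... PARTITION
// and SELECT alike (see the SELECT change that follows).
sql("SELECT `date`, count(*) FROM test_parquet_partitioned_cache_test GROUP BY `date`")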
checkAnswer( - sql("select STRINGField, date, intField from test_parquet_partitioned_cache_test"), + sql("select STRINGField, `date`, intField from test_parquet_partitioned_cache_test"), sql( """ |select b, '2015-04-01', a FROM jt @@ -521,8 +528,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { invalidateTable("test_parquet_partitioned_cache_test") assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null) - sql("DROP TABLE test_insert_parquet") - sql("DROP TABLE test_parquet_partitioned_cache_test") + dropTables("test_insert_parquet", "test_parquet_partitioned_cache_test") } } @@ -532,6 +538,11 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { class ParquetSourceSuite extends ParquetPartitioningTest { override def beforeAll(): Unit = { super.beforeAll() + dropTables("partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes", + "normal_parquet") sql( s""" create temporary table partitioned_parquet @@ -635,22 +646,22 @@ class ParquetSourceSuite extends ParquetPartitioningTest { StructField("a", arrayType1, nullable = true) :: Nil) assert(df.schema === expectedSchema1) - df.write.format("parquet").saveAsTable("alwaysNullable") + withTable("alwaysNullable") { + df.write.format("parquet").saveAsTable("alwaysNullable") - val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = true) - val arrayType2 = ArrayType(IntegerType, containsNull = true) - val expectedSchema2 = - StructType( - StructField("m", mapType2, nullable = true) :: - StructField("a", arrayType2, nullable = true) :: Nil) + val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = true) + val arrayType2 = ArrayType(IntegerType, containsNull = true) + val expectedSchema2 = + StructType( + StructField("m", mapType2, nullable = true) :: + StructField("a", arrayType2, nullable = true) :: Nil) - assert(table("alwaysNullable").schema === expectedSchema2) - - checkAnswer( - sql("SELECT m, a FROM alwaysNullable"), - Row(Map(2 -> 3), Seq(4, 5, 6))) + assert(table("alwaysNullable").schema === expectedSchema2) - sql("DROP TABLE alwaysNullable") + checkAnswer( + sql("SELECT m, a FROM alwaysNullable"), + Row(Map(2 -> 3), Seq(4, 5, 6))) + } } test("Aggregation attribute names can't contain special chars \" ,;{}()\\n\\t=\"") { @@ -674,7 +685,8 @@ class ParquetSourceSuite extends ParquetPartitioningTest { * A collection of tests for parquet data with various forms of partitioning. 
*/ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with BeforeAndAfterAll { - override def sqlContext: SQLContext = TestHive + override def _sqlContext: SQLContext = TestHive + protected val sqlContext = _sqlContext var partitionedTableDir: File = null var normalTableDir: File = null @@ -738,6 +750,16 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with partitionedTableDirWithKeyAndComplexTypes.delete() } + /** + * Drop named tables if they exist + * @param tableNames tables to drop + */ + def dropTables(tableNames: String*): Unit = { + tableNames.foreach { name => + sql(s"DROP TABLE IF EXISTS $name") + } + } + Seq( "partitioned_parquet", "partitioned_parquet_with_key", diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala index e976125b3706..b4640b161628 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala @@ -18,14 +18,16 @@ package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils { - override val sqlContext = TestHive + override def _sqlContext: SQLContext = TestHive + private val sqlContext = _sqlContext // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala new file mode 100644 index 000000000000..f7386e0db576 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import java.math.BigDecimal + +import org.apache.hadoop.fs.Path + +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { + override val dataSourceName: String = "json" + + import sqlContext._ + + // JSON does not write data of NullType and does not play well with BinaryType. 
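// How the supportsDataType hook (declared with a permissive default in HadoopFsRelationTest
// and overridden per format, as in the JSON override that follows) is consumed by the shared
// "test all data types" test further below: only columns whose types the source can
// round-trip are kept before saving. Condensed sketch, names as in that test.
val supportedColumns = schema.fields.collect {
  case StructField(name, dataType, _, _) if supportsDataType(dataType) => name
}
val dfToBeSaved = df.selectExpr(util.Random.shuffle(supportedColumns.toSeq): _*)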
+ override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: BinaryType => false + case _: CalendarIntervalType => false + case _ => true + } + + test("save()/load() - partitioned table - simple queries - partition columns in data") { + withTempDir { file => + val basePath = new Path(file.getCanonicalPath) + val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) + val qualifiedBasePath = fs.makeQualified(basePath) + + for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { + val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") + sparkContext + .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") + .saveAsTextFile(partitionDir.toString) + } + + val dataSchemaWithPartition = + StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) + + checkQueries( + read.format(dataSourceName) + .option("dataSchema", dataSchemaWithPartition.json) + .load(file.getCanonicalPath)) + } + } + + test("SPARK-9894: save complex types to JSON") { + withTempDir { file => + file.delete() + + val schema = + new StructType() + .add("array", ArrayType(LongType)) + .add("map", MapType(StringType, new StructType().add("innerField", LongType))) + + val data = + Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: + Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil + val df = createDataFrame(sparkContext.parallelize(data), schema) + + // Write the data out. + df.write.format(dataSourceName).save(file.getCanonicalPath) + + // Read it back and check the result. + checkAnswer( + read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), + df + ) + } + } + + test("SPARK-10196: save decimal type to JSON") { + withTempDir { file => + file.delete() + + val schema = + new StructType() + .add("decimal", DecimalType(7, 2)) + + val data = + Row(new BigDecimal("10.02")) :: + Row(new BigDecimal("20000.99")) :: + Row(new BigDecimal("10000")) :: Nil + val df = createDataFrame(sparkContext.parallelize(data), schema) + + // Write the data out. + df.write.format(dataSourceName).save(file.getCanonicalPath) + + // Read it back and check the result. + checkAnswer( + read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), + df + ) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala index d280543a071d..5275ae6511f1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala @@ -23,16 +23,23 @@ import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.{AnalysisException, SaveMode, parquet} -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.{execution, AnalysisException, SaveMode} +import org.apache.spark.sql.types._ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { - override val dataSourceName: String = classOf[parquet.DefaultSource].getCanonicalName + override val dataSourceName: String = "parquet" import sqlContext._ import sqlContext.implicits._ + // Parquet does not play well with NullType. 
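// Condensed sketch of the round trip the two JSON tests above perform, for any DataFrame
// df whose column types the source supports (df and the temp path are illustrative):
// write with the format under test, then read back with an explicit schema so the
// comparison does not depend on schema inference.
withTempPath { file =>
  df.write.format(dataSourceName).save(file.getCanonicalPath)
  checkAnswer(
    read.format(dataSourceName).schema(df.schema).load(file.getCanonicalPath),
    df)
}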
+ override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: CalendarIntervalType => false + case _ => true + } + test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) @@ -136,4 +143,17 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { assert(fs.exists(commonSummaryPath)) } } + + test("SPARK-10334 Projections and filters should be kept in physical plan") { + withTempPath { dir => + val path = dir.getCanonicalPath + + sqlContext.range(2).select('id as 'a, 'id as 'b).write.partitionBy("b").parquet(path) + val df = sqlContext.read.parquet(path).filter('a === 0).select('b) + val physicalPlan = df.queryExecution.executedPlan + + assert(physicalPlan.collect { case p: execution.Project => p }.length === 1) + assert(physicalPlan.collect { case p: execution.Filter => p }.length === 1) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala index e8975e5f5cd0..bd0abecd37a4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala @@ -20,13 +20,30 @@ package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName import sqlContext._ + // We have a very limited number of supported types at here since it is just for a + // test relation and we do very basic testing at here. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: BinaryType => false + // We are using random data generator and the generated strings are not really valid string. 
+ case _: StringType => false + case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 + case _: CalendarIntervalType => false + case _: DateType => false + case _: TimestampType => false + case _: ArrayType => false + case _: MapType => false + case _: StructType => false + case _: UserDefinedType[_] => false + case _ => true + } + test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index e8141923a9b5..aeaaa3e1c522 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputForma import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.spark.rdd.RDD +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types.{DataType, StructType} @@ -53,8 +54,10 @@ class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullW numberFormat.setGroupingUsed(false) override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId val name = FileOutputFormat.getOutputName(context) new Path(outputFile, s"$name-${numberFormat.format(split)}-$uniqueWriteJobId") } @@ -65,7 +68,9 @@ class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends new AppendingTextOutputFormat(new Path(path)).getRecordWriter(context) override def write(row: Row): Unit = { - val serialized = row.toSeq.map(_.toString).mkString(",") + val serialized = row.toSeq.map { v => + if (v == null) "" else v.toString + }.mkString(",") recordWriter.write(null, new Text(serialized)) } @@ -109,7 +114,8 @@ class SimpleTextRelation( val fields = dataSchema.map(_.dataType) sparkContext.textFile(inputStatuses.map(_.getPath).mkString(",")).map { record => - Row(record.split(",").zip(fields).map { case (value, dataType) => + Row(record.split(",", -1).zip(fields).map { case (v, dataType) => + val value = if (v == "") null else v // `Cast`ed values are always of Catalyst types (i.e. UTF8String instead of String, etc.) 
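// Why the -1 limit added above matters (standard String.split behaviour, shown for
// reference): without it, trailing empty fields, i.e. trailing nulls serialized as "",
// are silently dropped and the row comes back with too few columns.
"1,,".split(",")      // -> Array("1")           trailing empty strings removed
"1,,".split(",", -1)  // -> Array("1", "", "")   all three fields preserved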
val catalystValue = Cast(Literal(value), dataType).eval() // Here we're converting Catalyst values to Scala values to test `needsConversion` diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index dd274023a1cf..0f1cddda9c57 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -34,13 +34,14 @@ import org.apache.spark.sql.types._ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { - override lazy val sqlContext: SQLContext = TestHive - - import sqlContext.sql + override def _sqlContext: SQLContext = TestHive + protected val sqlContext = _sqlContext import sqlContext.implicits._ val dataSourceName: String + protected def supportsDataType(dataType: DataType): Boolean = true + val dataSchema = StructType( Seq( @@ -101,6 +102,83 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } + ignore("test all data types") { + withTempPath { file => + // Create the schema. + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + // TODO: add CalendarIntervalType to here once we can save it out. + val dataTypes = + Seq( + StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) + } + val schema = StructType(fields) + + // Generate data at the driver side. We need to materialize the data first and then + // create RDD. + val maybeDataGenerator = + RandomDataGenerator.forType( + dataType = schema, + nullable = true, + seed = Some(System.nanoTime())) + val dataGenerator = + maybeDataGenerator + .getOrElse(fail(s"Failed to create data generator for schema $schema")) + val data = (1 to 10).map { i => + dataGenerator.apply() match { + case row: Row => row + case null => Row.fromSeq(Seq.fill(schema.length)(null)) + case other => + fail(s"Row or null is expected to be generated, " + + s"but a ${other.getClass.getCanonicalName} is generated.") + } + } + + // Create a DF for the schema with random data. + val rdd = sqlContext.sparkContext.parallelize(data, 10) + val df = sqlContext.createDataFrame(rdd, schema) + + // All columns that have supported data types of this source. + val supportedColumns = schema.fields.collect { + case StructField(name, dataType, _, _) if supportsDataType(dataType) => name + } + val selectedColumns = util.Random.shuffle(supportedColumns.toSeq) + + val dfToBeSaved = df.selectExpr(selectedColumns: _*) + + // Save the data out. + dfToBeSaved + .write + .format(dataSourceName) + .option("dataSchema", dfToBeSaved.schema.json) // This option is just used by tests. + .save(file.getCanonicalPath) + + val loadedDF = + sqlContext + .read + .format(dataSourceName) + .schema(dfToBeSaved.schema) + .option("dataSchema", dfToBeSaved.schema.json) // This option is just used by tests. + .load(file.getCanonicalPath) + .selectExpr(selectedColumns: _*) + + // Read the data back. 
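// The SPARK-9899 test further below points the write job at an output committer that
// always fails, so it only passes if Spark ignores the custom committer once speculation
// is on. A minimal, hypothetical sketch of what such a committer can look like (the class
// name here is made up; the suite references its own committer defined elsewhere in this file):
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
class FailOnCommitCommitter(outputPath: Path, context: TaskAttemptContext)
  extends FileOutputCommitter(outputPath, context) {
  // Fails every job commit on purpose; task-level behaviour is inherited unchanged.
  override def commitJob(context: JobContext): Unit =
    sys.error("Intentional job commit failure for testing")
}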
+ checkAnswer( + loadedDF, + dfToBeSaved + ) + } + } + test("save()/load() - non-partitioned table - Overwrite") { withTempPath { file => testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath) @@ -444,7 +522,9 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } - test("Partition column type casting") { + // HadoopFsRelation.discoverPartitions() called by refresh(), which will ignore + // the given partition data type. + ignore("Partition column type casting") { withTempPath { file => val input = partitionedTestDF.select('a, 'b, 'p1.cast(StringType).as('ps), 'p2) @@ -552,6 +632,40 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } + + test("SPARK-9899 Disable customized output committer when speculation is on") { + val clonedConf = new Configuration(configuration) + val speculationEnabled = + sqlContext.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) + + try { + withTempPath { dir => + // Enables task speculation + sqlContext.sparkContext.conf.set("spark.speculation", "true") + + // Uses a customized output committer which always fails + configuration.set( + SQLConf.OUTPUT_COMMITTER_CLASS.key, + classOf[AlwaysFailOutputCommitter].getName) + + // Code below shouldn't throw since customized output committer should be disabled. + val df = sqlContext.range(10).coalesce(1) + df.write.format(dataSourceName).save(dir.getCanonicalPath) + checkAnswer( + sqlContext + .read + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .load(dir.getCanonicalPath), + df) + } + } finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + sqlContext.sparkContext.conf.set("spark.speculation", speculationEnabled.toString) + } + } } // This class is used to test SPARK-8578. We should not use any custom output committer when diff --git a/streaming/pom.xml b/streaming/pom.xml index 697895e72fe5..ad289d4e754a 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java index 8c0fdfa9c747..3738fc1a235c 100644 --- a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java +++ b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java @@ -21,6 +21,8 @@ import java.util.Iterator; /** + * :: DeveloperApi :: + * * This abstract class represents a write ahead log (aka journal) that is used by Spark Streaming * to save the received data (by receivers) and associated metadata to a reliable storage, so that * they can be recovered after driver failures. 
See the Spark documentation for more information diff --git a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java index 02324189b782..662889e779fb 100644 --- a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java +++ b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java @@ -18,6 +18,8 @@ package org.apache.spark.streaming.util; /** + * :: DeveloperApi :: + * * This abstract class represents a handle that refers to a record written in a * {@link org.apache.spark.streaming.util.WriteAheadLog WriteAheadLog}. * It must contain all the information necessary for the record to be read and returned by diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 2780d5b6adbc..27024ecfd910 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -49,6 +49,8 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) // Reload properties for the checkpoint application since user wants to set a reload property // or spark had changed its value and user wants to set it back. val propertiesToReload = List( + "spark.yarn.app.id", + "spark.yarn.app.attemptId", "spark.driver.host", "spark.driver.port", "spark.master", @@ -192,7 +194,9 @@ class CheckpointWriter( + "'") // Write checkpoint to temp file - fs.delete(tempFile, true) // just in case it exists + if (fs.exists(tempFile)) { + fs.delete(tempFile, true) // just in case it exists + } val fos = fs.create(tempFile) Utils.tryWithSafeFinally { fos.write(bytes) @@ -203,7 +207,9 @@ class CheckpointWriter( // If the checkpoint file exists, back it up // If the backup exists as well, just delete it, otherwise rename will fail if (fs.exists(checkpointFile)) { - fs.delete(backupFile, true) // just in case it exists + if (fs.exists(backupFile)){ + fs.delete(backupFile, true) // just in case it exists + } if (!fs.rename(checkpointFile, backupFile)) { logWarning("Could not rename " + checkpointFile + " to " + backupFile) } @@ -282,6 +288,15 @@ class CheckpointWriter( private[streaming] object CheckpointReader extends Logging { + /** + * Read checkpoint files present in the given checkpoint directory. If there are no checkpoint + * files, then return None, else try to return the latest valid checkpoint object. If no + * checkpoint files could be read correctly, then return None. + */ + def read(checkpointDir: String): Option[Checkpoint] = { + read(checkpointDir, new SparkConf(), SparkHadoopUtil.get.conf, ignoreReadError = true) + } + /** * Read checkpoint files present in the given checkpoint directory. If there are no checkpoint * files, then return None, else try to return the latest valid checkpoint object. 
If no @@ -306,7 +321,7 @@ object CheckpointReader extends Logging { // Try to read the checkpoint files in the order logInfo("Checkpoint files found: " + checkpointFiles.mkString(",")) - val compressionCodec = CompressionCodec.createCodec(conf) + var readError: Exception = null checkpointFiles.foreach(file => { logInfo("Attempting to load checkpoint from file " + file) try { @@ -317,13 +332,15 @@ object CheckpointReader extends Logging { return Some(cp) } catch { case e: Exception => + readError = e logWarning("Error reading checkpoint from file " + file, e) } }) // If none of checkpoint files could be read, then throw exception if (!ignoreReadError) { - throw new SparkException(s"Failed to read checkpoint from directory $checkpointPath") + throw new SparkException( + s"Failed to read checkpoint from directory $checkpointPath", readError) } None } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 177e710ace54..b496d1f341a0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, Utils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, Utils} /** * Main entry point for Spark Streaming functionality. It provides methods used to create @@ -604,7 +604,7 @@ class StreamingContext private[streaming] ( } StreamingContext.setActiveContext(this) } - shutdownHookRef = Utils.addShutdownHook( + shutdownHookRef = ShutdownHookManager.addShutdownHook( StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown) // Registering Streaming Metrics at the start of the StreamingContext assert(env.metricsSystem != null) @@ -691,7 +691,7 @@ class StreamingContext private[streaming] ( StreamingContext.setActiveContext(null) waiter.notifyStop() if (shutdownHookRef != null) { - Utils.removeShutdownHook(shutdownHookRef) + ShutdownHookManager.removeShutdownHook(shutdownHookRef) } logInfo("StreamingContext stopped successfully") } @@ -725,7 +725,7 @@ object StreamingContext extends Logging { */ private val ACTIVATION_LOCK = new Object() - private val SHUTDOWN_HOOK_PRIORITY = Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY + 1 + private val SHUTDOWN_HOOK_PRIORITY = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY + 1 private val activeContext = new AtomicReference[StreamingContext](null) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index 808dcc174cf9..214cd80108b9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -291,7 +291,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * * @deprecated As of release 0.9.0, replaced by foreachRDD */ - @Deprecated + @deprecated("Use foreachRDD", "0.9.0") def foreach(foreachFunc: JFunction[R, Void]) { foreachRDD(foreachFunc) } @@ -302,7 +302,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, 
This, R], R <: JavaRDDLike[T * * @deprecated As of release 0.9.0, replaced by foreachRDD */ - @Deprecated + @deprecated("Use foreachRDD", "0.9.0") def foreach(foreachFunc: JFunction2[R, Time, Void]) { foreachRDD(foreachFunc) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index a2f5d82a79bd..bab78a3536b4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.dstream -import java.io.{NotSerializableException, ObjectOutputStream} +import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag @@ -37,8 +37,13 @@ class QueueInputDStream[T: ClassTag]( override def stop() { } + private def readObject(in: ObjectInputStream): Unit = { + throw new NotSerializableException("queueStream doesn't support checkpointing. " + + "Please don't use queueStream when checkpointing is enabled.") + } + private def writeObject(oos: ObjectOutputStream): Unit = { - throw new NotSerializableException("queueStream doesn't support checkpointing") + logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala index 670ef8d296a0..6c139f32da31 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala @@ -21,12 +21,12 @@ import scala.reflect.ClassTag import org.apache.spark.rdd.{BlockRDD, RDD} import org.apache.spark.storage.BlockId -import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD import org.apache.spark.streaming.receiver.Receiver -import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} import org.apache.spark.streaming.scheduler.rate.RateEstimator +import org.apache.spark.streaming.scheduler.{ReceivedBlockInfo, RateController, StreamInputInfo} import org.apache.spark.streaming.util.WriteAheadLogUtils +import org.apache.spark.streaming.{StreamingContext, Time} /** * Abstract class for defining any [[org.apache.spark.streaming.dstream.InputDStream]] @@ -79,48 +79,63 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont // for this batch val receiverTracker = ssc.scheduler.receiverTracker val blockInfos = receiverTracker.getBlocksOfBatch(validTime).getOrElse(id, Seq.empty) - val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray // Register the input blocks information into InputInfoTracker val inputInfo = StreamInputInfo(id, blockInfos.flatMap(_.numRecords).sum) ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo) - if (blockInfos.nonEmpty) { - // Are WAL record handles present with all the blocks - val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty } + // Create the BlockRDD + createBlockRDD(validTime, blockInfos) + } + } + Some(blockRDD) + } + + private[streaming] def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { - 
if (areWALRecordHandlesPresent) { - // If all the blocks have WAL record handle, then create a WALBackedBlockRDD - val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray - val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray - new WriteAheadLogBackedBlockRDD[T]( - ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid) - } else { - // Else, create a BlockRDD. However, if there are some blocks with WAL info but not - // others then that is unexpected and log a warning accordingly. - if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) { - if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) { - logError("Some blocks do not have Write Ahead Log information; " + - "this is unexpected and data may not be recoverable after driver failures") - } else { - logWarning("Some blocks have Write Ahead Log information; this is unexpected") - } - } - new BlockRDD[T](ssc.sc, blockIds) - } - } else { - // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD - // according to the configuration + if (blockInfos.nonEmpty) { + val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray + + // Are WAL record handles present with all the blocks + val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty } + + if (areWALRecordHandlesPresent) { + // If all the blocks have WAL record handle, then create a WALBackedBlockRDD + val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray + val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray + new WriteAheadLogBackedBlockRDD[T]( + ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid) + } else { + // Else, create a BlockRDD. However, if there are some blocks with WAL info but not + // others then that is unexpected and log a warning accordingly. + if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) { if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) { - new WriteAheadLogBackedBlockRDD[T]( - ssc.sparkContext, Array.empty, Array.empty, Array.empty) + logError("Some blocks do not have Write Ahead Log information; " + + "this is unexpected and data may not be recoverable after driver failures") } else { - new BlockRDD[T](ssc.sc, Array.empty) + logWarning("Some blocks have Write Ahead Log information; this is unexpected") } } + val validBlockIds = blockIds.filter { id => + ssc.sparkContext.env.blockManager.master.contains(id) + } + if (validBlockIds.size != blockIds.size) { + logWarning("Some blocks could not be recovered as they were not found in memory. 
" + + "To prevent such data loss, enabled Write Ahead Log (see programming guide " + + "for more details.") + } + new BlockRDD[T](ssc.sc, validBlockIds) + } + } else { + // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD + // according to the configuration + if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) { + new WriteAheadLogBackedBlockRDD[T]( + ssc.sparkContext, Array.empty, Array.empty, Array.empty) + } else { + new BlockRDD[T](ssc.sc, Array.empty) } } - Some(blockRDD) } /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index 31ce8e1ec14d..e081ffe46f50 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -75,7 +75,7 @@ private[streaming] class WriteAheadLogBackedBlockRDD[T: ClassTag]( @transient sc: SparkContext, @transient blockIds: Array[BlockId], - @transient walRecordHandles: Array[WriteAheadLogRecordHandle], + @transient val walRecordHandles: Array[WriteAheadLogRecordHandle], @transient isBlockIdValid: Array[Boolean] = Array.empty, storeInBlockManager: Boolean = false, storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER) @@ -84,7 +84,7 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag]( require( blockIds.length == walRecordHandles.length, s"Number of block Ids (${blockIds.length}) must be " + - s" same as number of WAL record handles (${walRecordHandles.length}})") + s" same as number of WAL record handles (${walRecordHandles.length})") require( isBlockIdValid.isEmpty || isBlockIdValid.length == blockIds.length, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ActorReceiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ActorReceiver.scala index cd309788a771..7ec74016a1c2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ActorReceiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ActorReceiver.scala @@ -144,7 +144,7 @@ private[streaming] class ActorReceiver[T: ClassTag]( receiverSupervisorStrategy: SupervisorStrategy ) extends Receiver[T](storageLevel) with Logging { - protected lazy val supervisor = SparkEnv.get.actorSystem.actorOf(Props(new Supervisor), + protected lazy val actorSupervisor = SparkEnv.get.actorSystem.actorOf(Props(new Supervisor), "Supervisor" + streamId) class Supervisor extends Actor { @@ -191,11 +191,11 @@ private[streaming] class ActorReceiver[T: ClassTag]( } def onStart(): Unit = { - supervisor - logInfo("Supervision tree for receivers initialized at:" + supervisor.path) + actorSupervisor + logInfo("Supervision tree for receivers initialized at:" + actorSupervisor.path) } def onStop(): Unit = { - supervisor ! PoisonPill + actorSupervisor ! 
PoisonPill } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 92b51ce39234..421d60ae359f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -21,10 +21,10 @@ import java.util.concurrent.{ArrayBlockingQueue, TimeUnit} import scala.collection.mutable.ArrayBuffer -import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.{SparkException, Logging, SparkConf} import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.util.RecurringTimer -import org.apache.spark.util.SystemClock +import org.apache.spark.util.{Clock, SystemClock} /** Listener object for BlockGenerator events */ private[streaming] trait BlockGeneratorListener { @@ -69,16 +69,35 @@ private[streaming] trait BlockGeneratorListener { * named blocks at regular intervals. This class starts two threads, * one to periodically start a new batch and prepare the previous batch of as a block, * the other to push the blocks into the block manager. + * + * Note: Do not create BlockGenerator instances directly inside receivers. Use + * `ReceiverSupervisor.createBlockGenerator` to create a BlockGenerator and use it. */ private[streaming] class BlockGenerator( listener: BlockGeneratorListener, receiverId: Int, - conf: SparkConf + conf: SparkConf, + clock: Clock = new SystemClock() ) extends RateLimiter(conf) with Logging { private case class Block(id: StreamBlockId, buffer: ArrayBuffer[Any]) - private val clock = new SystemClock() + /** + * The BlockGenerator can be in 5 possible states, in the order as follows. + * - Initialized: Nothing has been started + * - Active: start() has been called, and it is generating blocks on added data. + * - StoppedAddingData: stop() has been called, the adding of data has been stopped, + * but blocks are still being generated and pushed. + * - StoppedGeneratingBlocks: Generating of blocks has been stopped, but + * they are still being pushed. + * - StoppedAll: Everything has stopped, and the BlockGenerator object can be GCed. + */ + private object GeneratorState extends Enumeration { + type GeneratorState = Value + val Initialized, Active, StoppedAddingData, StoppedGeneratingBlocks, StoppedAll = Value + } + import GeneratorState._ + private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms") require(blockIntervalMs > 0, s"'spark.streaming.blockInterval' should be a positive value") @@ -89,70 +108,140 @@ private[streaming] class BlockGenerator( private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } } @volatile private var currentBuffer = new ArrayBuffer[Any] - @volatile private var stopped = false + @volatile private var state = Initialized /** Start block generating and pushing threads. */ - def start() { - blockIntervalTimer.start() - blockPushingThread.start() - logInfo("Started BlockGenerator") + def start(): Unit = synchronized { + if (state == Initialized) { + state = Active + blockIntervalTimer.start() + blockPushingThread.start() + logInfo("Started BlockGenerator") + } else { + throw new SparkException( + s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]") + } } - /** Stop all threads. */ - def stop() { + /** + * Stop everything in the right order such that all the data added is pushed out correctly. 
+ * - First, stop adding data to the current buffer. + * - Second, stop generating blocks. + * - Finally, wait for queue of to-be-pushed blocks to be drained. + */ + def stop(): Unit = { + // Set the state to stop adding data + synchronized { + if (state == Active) { + state = StoppedAddingData + } else { + logWarning(s"Cannot stop BlockGenerator as its not in the Active state [state = $state]") + return + } + } + + // Stop generating blocks and set the state for block pushing thread to start draining the queue logInfo("Stopping BlockGenerator") blockIntervalTimer.stop(interruptTimer = false) - stopped = true - logInfo("Waiting for block pushing thread") + synchronized { state = StoppedGeneratingBlocks } + + // Wait for the queue to drain and mark generated as stopped + logInfo("Waiting for block pushing thread to terminate") blockPushingThread.join() + synchronized { state = StoppedAll } logInfo("Stopped BlockGenerator") } /** - * Push a single data item into the buffer. All received data items - * will be periodically pushed into BlockManager. + * Push a single data item into the buffer. */ - def addData (data: Any): Unit = synchronized { - waitToPush() - currentBuffer += data + def addData(data: Any): Unit = { + if (state == Active) { + waitToPush() + synchronized { + if (state == Active) { + currentBuffer += data + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") + } + } + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") + } } /** * Push a single data item into the buffer. After buffering the data, the - * `BlockGeneratorListener.onAddData` callback will be called. All received data items - * will be periodically pushed into BlockManager. + * `BlockGeneratorListener.onAddData` callback will be called. */ - def addDataWithCallback(data: Any, metadata: Any): Unit = synchronized { - waitToPush() - currentBuffer += data - listener.onAddData(data, metadata) + def addDataWithCallback(data: Any, metadata: Any): Unit = { + if (state == Active) { + waitToPush() + synchronized { + if (state == Active) { + currentBuffer += data + listener.onAddData(data, metadata) + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") + } + } + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") + } } /** * Push multiple data items into the buffer. After buffering the data, the - * `BlockGeneratorListener.onAddData` callback will be called. All received data items - * will be periodically pushed into BlockManager. Note that all the data items is guaranteed - * to be present in a single block. + * `BlockGeneratorListener.onAddData` callback will be called. Note that all the data items + * are atomically added to the buffer, and are hence guaranteed to be present in a single block. 
*/ - def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = synchronized { - dataIterator.foreach { data => - waitToPush() - currentBuffer += data + def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = { + if (state == Active) { + // Unroll iterator into a temp buffer, and wait for pushing in the process + val tempBuffer = new ArrayBuffer[Any] + dataIterator.foreach { data => + waitToPush() + tempBuffer += data + } + synchronized { + if (state == Active) { + currentBuffer ++= tempBuffer + listener.onAddData(tempBuffer, metadata) + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") + } + } + } else { + throw new SparkException( + "Cannot add data as BlockGenerator has not been started or has been stopped") } - listener.onAddData(dataIterator, metadata) } + def isActive(): Boolean = state == Active + + def isStopped(): Boolean = state == StoppedAll + /** Change the buffer to which single records are added to. */ - private def updateCurrentBuffer(time: Long): Unit = synchronized { + private def updateCurrentBuffer(time: Long): Unit = { try { - val newBlockBuffer = currentBuffer - currentBuffer = new ArrayBuffer[Any] - if (newBlockBuffer.size > 0) { - val blockId = StreamBlockId(receiverId, time - blockIntervalMs) - val newBlock = new Block(blockId, newBlockBuffer) - listener.onGenerateBlock(blockId) + var newBlock: Block = null + synchronized { + if (currentBuffer.nonEmpty) { + val newBlockBuffer = currentBuffer + currentBuffer = new ArrayBuffer[Any] + val blockId = StreamBlockId(receiverId, time - blockIntervalMs) + listener.onGenerateBlock(blockId) + newBlock = new Block(blockId, newBlockBuffer) + } + } + + if (newBlock != null) { blocksForPushing.put(newBlock) // put is blocking when queue is full - logDebug("Last element in " + blockId + " is " + newBlockBuffer.last) } } catch { case ie: InterruptedException => @@ -165,18 +254,25 @@ private[streaming] class BlockGenerator( /** Keep pushing blocks to the BlockManager. */ private def keepPushingBlocks() { logInfo("Started block pushing thread") + + def areBlocksBeingGenerated: Boolean = synchronized { + state != StoppedGeneratingBlocks + } + try { - while (!stopped) { - Option(blocksForPushing.poll(100, TimeUnit.MILLISECONDS)) match { + // While blocks are being generated, keep polling for to-be-pushed blocks and push them. + while (areBlocksBeingGenerated) { + Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match { case Some(block) => pushBlock(block) case None => } } - // Push out the blocks that are still left + + // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks. logInfo("Pushing out the last " + blocksForPushing.size() + " blocks") while (!blocksForPushing.isEmpty) { - logDebug("Getting block ") val block = blocksForPushing.take() + logDebug(s"Pushing block $block") pushBlock(block) logInfo("Blocks left to push " + blocksForPushing.size()) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala index f663def4c051..bca1fbc8fda2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala @@ -45,8 +45,7 @@ private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging { /** * Return the current rate limit. 
If no limit has been set so far, it returns {{{Long.MaxValue}}}. */ - def getCurrentLimit: Long = - rateLimiter.getRate.toLong + def getCurrentLimit: Long = rateLimiter.getRate.toLong /** * Set the rate limit to `newRate`. The new rate will not exceed the maximum rate configured by diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index c8dd6e06812d..5f6c5b024085 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -222,7 +222,7 @@ private[streaming] object WriteAheadLogBasedBlockHandler { /** * A utility that will wrap the Iterator to get the count */ -private class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { +private[streaming] class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { private var _count = 0 private def isFullyConsumed: Boolean = !iterator.hasNext diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala index 7504fa44d9fa..554aae0117b2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala @@ -116,12 +116,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * being pushed into Spark's memory. */ def store(dataItem: T) { - executor.pushSingle(dataItem) + supervisor.pushSingle(dataItem) } /** Store an ArrayBuffer of received data as a data block into Spark's memory. */ def store(dataBuffer: ArrayBuffer[T]) { - executor.pushArrayBuffer(dataBuffer, None, None) + supervisor.pushArrayBuffer(dataBuffer, None, None) } /** @@ -130,12 +130,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * for being used in the corresponding InputDStream. */ def store(dataBuffer: ArrayBuffer[T], metadata: Any) { - executor.pushArrayBuffer(dataBuffer, Some(metadata), None) + supervisor.pushArrayBuffer(dataBuffer, Some(metadata), None) } /** Store an iterator of received data as a data block into Spark's memory. */ def store(dataIterator: Iterator[T]) { - executor.pushIterator(dataIterator, None, None) + supervisor.pushIterator(dataIterator, None, None) } /** @@ -144,12 +144,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * for being used in the corresponding InputDStream. */ def store(dataIterator: java.util.Iterator[T], metadata: Any) { - executor.pushIterator(dataIterator, Some(metadata), None) + supervisor.pushIterator(dataIterator, Some(metadata), None) } /** Store an iterator of received data as a data block into Spark's memory. */ def store(dataIterator: java.util.Iterator[T]) { - executor.pushIterator(dataIterator, None, None) + supervisor.pushIterator(dataIterator, None, None) } /** @@ -158,7 +158,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * for being used in the corresponding InputDStream. */ def store(dataIterator: Iterator[T], metadata: Any) { - executor.pushIterator(dataIterator, Some(metadata), None) + supervisor.pushIterator(dataIterator, Some(metadata), None) } /** @@ -167,7 +167,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * that Spark is configured to use. 
*/ def store(bytes: ByteBuffer) { - executor.pushBytes(bytes, None, None) + supervisor.pushBytes(bytes, None, None) } /** @@ -176,12 +176,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * for being used in the corresponding InputDStream. */ def store(bytes: ByteBuffer, metadata: Any) { - executor.pushBytes(bytes, Some(metadata), None) + supervisor.pushBytes(bytes, Some(metadata), None) } /** Report exceptions in receiving data. */ def reportError(message: String, throwable: Throwable) { - executor.reportError(message, throwable) + supervisor.reportError(message, throwable) } /** @@ -193,7 +193,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The `message` will be reported to the driver. */ def restart(message: String) { - executor.restartReceiver(message) + supervisor.restartReceiver(message) } /** @@ -205,7 +205,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The `message` and `exception` will be reported to the driver. */ def restart(message: String, error: Throwable) { - executor.restartReceiver(message, Some(error)) + supervisor.restartReceiver(message, Some(error)) } /** @@ -215,22 +215,22 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * in a background thread. */ def restart(message: String, error: Throwable, millisecond: Int) { - executor.restartReceiver(message, Some(error), millisecond) + supervisor.restartReceiver(message, Some(error), millisecond) } /** Stop the receiver completely. */ def stop(message: String) { - executor.stop(message, None) + supervisor.stop(message, None) } /** Stop the receiver completely due to an exception */ def stop(message: String, error: Throwable) { - executor.stop(message, Some(error)) + supervisor.stop(message, Some(error)) } /** Check if the receiver has started or not. */ def isStarted(): Boolean = { - executor.isReceiverStarted() + supervisor.isReceiverStarted() } /** @@ -238,7 +238,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * the receiving of data should be stopped. */ def isStopped(): Boolean = { - executor.isReceiverStopped() + supervisor.isReceiverStopped() } /** @@ -257,7 +257,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable private var id: Int = -1 /** Handler object that runs the receiver. This is instantiated lazily in the worker. */ - private[streaming] var executor_ : ReceiverSupervisor = null + @transient private var _supervisor : ReceiverSupervisor = null /** Set the ID of the DStream that this receiver is associated with. */ private[streaming] def setReceiverId(id_ : Int) { @@ -265,15 +265,17 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable } /** Attach Network Receiver executor to this receiver. */ - private[streaming] def attachExecutor(exec: ReceiverSupervisor) { - assert(executor_ == null) - executor_ = exec + private[streaming] def attachSupervisor(exec: ReceiverSupervisor) { + assert(_supervisor == null) + _supervisor = exec } - /** Get the attached executor. */ - private def executor: ReceiverSupervisor = { - assert(executor_ != null, "Executor has not been attached to this receiver") - executor_ + /** Get the attached supervisor. */ + private[streaming] def supervisor: ReceiverSupervisor = { + assert(_supervisor != null, + "A ReceiverSupervisor have not been attached to the receiver yet. 
Maybe you are starting " + + "some computation in the receiver before the Receiver.onStart() has been called.") + _supervisor } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index e98017a63756..158d1ba2f183 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -44,8 +44,8 @@ private[streaming] abstract class ReceiverSupervisor( } import ReceiverState._ - // Attach the executor to the receiver - receiver.attachExecutor(this) + // Attach the supervisor to the receiver + receiver.attachSupervisor(this) private val futureExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("receiver-supervisor-future", 128)) @@ -60,7 +60,7 @@ private[streaming] abstract class ReceiverSupervisor( private val defaultRestartDelay = conf.getInt("spark.streaming.receiverRestartDelay", 2000) /** The current maximum rate limit for this receiver. */ - private[streaming] def getCurrentRateLimit: Option[Long] = None + private[streaming] def getCurrentRateLimit: Long = Long.MaxValue /** Exception associated with the stopping of the receiver */ @volatile protected var stoppingError: Throwable = null @@ -92,13 +92,30 @@ private[streaming] abstract class ReceiverSupervisor( optionalBlockId: Option[StreamBlockId] ) + /** + * Create a custom [[BlockGenerator]] that the receiver implementation can directly control + * using their provided [[BlockGeneratorListener]]. + * + * Note: Do not explicitly start or stop the `BlockGenerator`, the `ReceiverSupervisorImpl` + * will take care of it. + */ + def createBlockGenerator(blockGeneratorListener: BlockGeneratorListener): BlockGenerator + /** Report errors. */ def reportError(message: String, throwable: Throwable) - /** Called when supervisor is started */ + /** + * Called when supervisor is started. + * Note that this must be called before the receiver.onStart() is called to ensure + * things like [[BlockGenerator]]s are started before the receiver starts sending data. + */ protected def onStart() { } - /** Called when supervisor is stopped */ + /** + * Called when supervisor is stopped. + * Note that this must be called after the receiver.onStop() is called to ensure + * things like [[BlockGenerator]]s are cleaned up after the receiver stops sending data. + */ protected def onStop(message: String, error: Option[Throwable]) { } /** Called when receiver is started. 
Return true if the driver accepts us */ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala index 0d802f83549a..59ef58d232ee 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala @@ -20,6 +20,7 @@ package org.apache.spark.streaming.receiver import java.nio.ByteBuffer import java.util.concurrent.atomic.AtomicLong +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import com.google.common.base.Throwables @@ -81,15 +82,20 @@ private[streaming] class ReceiverSupervisorImpl( cleanupOldBlocks(threshTime) case UpdateRateLimit(eps) => logInfo(s"Received a new rate limit: $eps.") - blockGenerator.updateRate(eps) + registeredBlockGenerators.foreach { bg => + bg.updateRate(eps) + } } }) /** Unique block ids if one wants to add blocks directly */ private val newBlockId = new AtomicLong(System.currentTimeMillis()) + private val registeredBlockGenerators = new mutable.ArrayBuffer[BlockGenerator] + with mutable.SynchronizedBuffer[BlockGenerator] + /** Divides received data records into data blocks for pushing in BlockManager. */ - private val blockGenerator = new BlockGenerator(new BlockGeneratorListener { + private val defaultBlockGeneratorListener = new BlockGeneratorListener { def onAddData(data: Any, metadata: Any): Unit = { } def onGenerateBlock(blockId: StreamBlockId): Unit = { } @@ -101,14 +107,15 @@ private[streaming] class ReceiverSupervisorImpl( def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) { pushArrayBuffer(arrayBuffer, None, Some(blockId)) } - }, streamId, env.conf) + } + private val defaultBlockGenerator = createBlockGenerator(defaultBlockGeneratorListener) - override private[streaming] def getCurrentRateLimit: Option[Long] = - Some(blockGenerator.getCurrentLimit) + /** Get the current rate limit of the default block generator */ + override private[streaming] def getCurrentRateLimit: Long = defaultBlockGenerator.getCurrentLimit /** Push a single record of received data into block generator. */ def pushSingle(data: Any) { - blockGenerator.addData(data) + defaultBlockGenerator.addData(data) } /** Store an ArrayBuffer of received data as a data block into Spark's memory. 
*/ @@ -162,11 +169,11 @@ private[streaming] class ReceiverSupervisorImpl( } override protected def onStart() { - blockGenerator.start() + registeredBlockGenerators.foreach { _.start() } } override protected def onStop(message: String, error: Option[Throwable]) { - blockGenerator.stop() + registeredBlockGenerators.foreach { _.stop() } env.rpcEnv.stop(endpoint) } @@ -183,6 +190,16 @@ private[streaming] class ReceiverSupervisorImpl( logInfo("Stopped receiver " + streamId) } + override def createBlockGenerator( + blockGeneratorListener: BlockGeneratorListener): BlockGenerator = { + // Cleanup BlockGenerators that have already been stopped + registeredBlockGenerators --= registeredBlockGenerators.filter{ _.isStopped() } + + val newBlockGenerator = new BlockGenerator(blockGeneratorListener, streamId, env.conf) + registeredBlockGenerators += newBlockGenerator + newBlockGenerator + } + /** Generate new block ID */ private def nextBlockId = StreamBlockId(streamId, newBlockId.getAndIncrement) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala index 363c03d431f0..deb15d075975 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala @@ -66,7 +66,7 @@ private[streaming] class InputInfoTracker(ssc: StreamingContext) extends Logging new mutable.HashMap[Int, StreamInputInfo]()) if (inputInfos.contains(inputInfo.inputStreamId)) { - throw new IllegalStateException(s"Input stream ${inputInfo.inputStreamId}} for batch" + + throw new IllegalStateException(s"Input stream ${inputInfo.inputStreamId} for batch" + s"$batchTime is already added into InputInfoTracker, this is a illegal state") } inputInfos += ((inputInfo.inputStreamId, inputInfo)) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 9f2117ada61c..2de035d166e7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -79,6 +79,10 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { def start(): Unit = synchronized { if (eventLoop != null) return // generator has already been started + // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock. 
+ // See SPARK-10125 + checkpointWriter + eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") { override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 58bdda7794bf..6d4cdc4aa6b1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.scheduler -import java.util.concurrent.{TimeUnit, ConcurrentHashMap, Executors} +import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.collection.JavaConversions._ import scala.util.{Failure, Success} @@ -25,7 +25,7 @@ import scala.util.{Failure, Success} import org.apache.spark.Logging import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming._ -import org.apache.spark.util.EventLoop +import org.apache.spark.util.{EventLoop, ThreadUtils} private[scheduler] sealed trait JobSchedulerEvent @@ -40,9 +40,12 @@ private[scheduler] case class ErrorReported(msg: String, e: Throwable) extends J private[streaming] class JobScheduler(val ssc: StreamingContext) extends Logging { - private val jobSets = new ConcurrentHashMap[Time, JobSet] + // Use of ConcurrentHashMap.keySet later causes an odd runtime problem due to Java 7/8 diff + // https://gist.github.com/AlainODea/1375759b8720a3f9f094 + private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet] private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1) - private val jobExecutor = Executors.newFixedThreadPool(numConcurrentJobs) + private val jobExecutor = + ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor") private val jobGenerator = new JobGenerator(this) val clock = jobGenerator.clock val listenerBus = new StreamingListenerBus() @@ -191,14 +194,25 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString) ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) try { - eventLoop.post(JobStarted(job)) - // Disable checks for existing output directories in jobs launched by the streaming - // scheduler, since we may need to write output to an existing directory during checkpoint - // recovery; see SPARK-4835 for more details. - PairRDDFunctions.disableOutputSpecValidation.withValue(true) { - job.run() + // We need to assign `eventLoop` to a temp variable. Otherwise, because + // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, + // it's possible that when `post` is called, `eventLoop` happens to be null. + var _eventLoop = eventLoop + if (_eventLoop != null) { + _eventLoop.post(JobStarted(job)) + // Disable checks for existing output directories in jobs launched by the streaming + // scheduler, since we may need to write output to an existing directory during checkpoint + // recovery; see SPARK-4835 for more details. + PairRDDFunctions.disableOutputSpecValidation.withValue(true) { + job.run() + } + _eventLoop = eventLoop + if (_eventLoop != null) { + _eventLoop.post(JobCompleted(job)) + } + } else { + // JobScheduler has been stopped.
} - eventLoop.post(JobCompleted(job)) } finally { ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null) ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala index 882ca0676b6a..a46c0c1b25e7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala @@ -76,9 +76,9 @@ private[streaming] abstract class RateController(val streamUID: Int, rateEstimat val elements = batchCompleted.batchInfo.streamIdToInputInfo for { - processingEnd <- batchCompleted.batchInfo.processingEndTime; - workDelay <- batchCompleted.batchInfo.processingDelay; - waitDelay <- batchCompleted.batchInfo.schedulingDelay; + processingEnd <- batchCompleted.batchInfo.processingEndTime + workDelay <- batchCompleted.batchInfo.processingDelay + waitDelay <- batchCompleted.batchInfo.schedulingDelay elems <- elements.get(streamUID).map(_.numRecords) } computeAndPublish(processingEnd, elems, workDelay, waitDelay) } @@ -86,5 +86,5 @@ private[streaming] abstract class RateController(val streamUID: Int, rateEstimat object RateController { def isBackPressureEnabled(conf: SparkConf): Boolean = - conf.getBoolean("spark.streaming.backpressure.enable", false) + conf.getBoolean("spark.streaming.backpressure.enabled", false) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 7720259a5d79..53b96d51c918 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.streaming.Time import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogUtils} import org.apache.spark.util.{Clock, Utils} -import org.apache.spark.{Logging, SparkConf, SparkException} +import org.apache.spark.{Logging, SparkConf} /** Trait representing any event in the ReceivedBlockTracker that updates its state. 
*/ private[streaming] sealed trait ReceivedBlockTrackerLogEvent @@ -199,7 +199,8 @@ private[streaming] class ReceivedBlockTracker( import scala.collection.JavaConversions._ writeAheadLog.readAll().foreach { byteBuffer => logTrace("Recovering record " + byteBuffer) - Utils.deserialize[ReceivedBlockTrackerLogEvent](byteBuffer.array) match { + Utils.deserialize[ReceivedBlockTrackerLogEvent]( + byteBuffer.array, Thread.currentThread().getContextClassLoader) match { case BlockAdditionEvent(receivedBlockInfo) => insertAddedBlock(receivedBlockInfo) case BatchAllocationEvent(time, allocatedBlocks) => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687b5831..10b5a7f57a80 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers evenly across the available executors. There are two + * phases for scheduling receivers. + * + * - The first phase is global scheduling, when ReceiverTracker is starting and we need to schedule + * all receivers at the same time. ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to distribute receivers evenly. ReceiverTracker should update its + * receiverTrackingInfoMap according to the results of `scheduleReceivers`. + * `ReceiverTrackingInfo.scheduledExecutors` for each receiver will be set to an executor list that + * contains the scheduled locations. Then when a receiver is starting, it will send a registration + * request and `ReceiverTracker.registerReceiver` will be called. In + * `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors are set, it should check + * whether the location of this receiver is one of the scheduled executors; if not, the + * registration will be rejected. + * - The second phase is local scheduling, when a receiver is restarting. There are two cases of + * receiver restarting: + * - If a receiver is restarting because it was rejected due to a mismatch between its real + * location and the scheduled executors, in other words, it failed to start in one of the + * locations that `scheduleReceivers` suggested, `ReceiverTracker` should first choose the + * executors that are still alive in the list of scheduled executors, then use them to launch + * the receiver job. + * - If a receiver is restarting without a scheduled executors list, or the executors in the list + * are dead, `ReceiverTracker` should call `rescheduleReceiver`. If so, `ReceiverTracker` should + * not set `ReceiverTrackingInfo.scheduledExecutors` for this receiver; instead, it should clear + * it. Then when this receiver is registering, we can know this is local scheduling, and + * `ReceiverTracker` should call `rescheduleReceiver` again to check whether the launching + * location matches. + * + * In conclusion, we should make a global schedule, follow it as closely as possible, and fall back + * to local scheduling otherwise. + */ private[streaming] class ReceiverSchedulingPolicy { /** @@ -102,8 +132,7 @@ private[streaming] class ReceiverSchedulingPolicy { /** * Return a list of candidate executors to run the receiver. If the list is empty, the caller can -
The caller can use `preferredNumExecutors` to require - * returning `preferredNumExecutors` executors if possible. + * run this receiver in arbitrary executor. * * This method tries to balance executors' load. Here is the approach to schedule executors * for a receiver. @@ -122,9 +151,8 @@ private[streaming] class ReceiverSchedulingPolicy { * If a receiver is scheduled to an executor but has not yet run, it contributes * `1.0 / #candidate_executors_of_this_receiver` to the executor's weight. * - * At last, if there are more than `preferredNumExecutors` idle executors (weight = 0), - * returns all idle executors. Otherwise, we only return `preferredNumExecutors` best options - * according to the weights. + * At last, if there are any idle executors (weight = 0), returns all idle executors. + * Otherwise, returns the executors that have the minimum weight. * * * @@ -134,8 +162,7 @@ private[streaming] class ReceiverSchedulingPolicy { receiverId: Int, preferredLocation: Option[String], receiverTrackingInfoMap: Map[Int, ReceiverTrackingInfo], - executors: Seq[String], - preferredNumExecutors: Int = 3): Seq[String] = { + executors: Seq[String]): Seq[String] = { if (executors.isEmpty) { return Seq.empty } @@ -156,15 +183,18 @@ private[streaming] class ReceiverSchedulingPolicy { } }.groupBy(_._1).mapValues(_.map(_._2).sum) // Sum weights for each executor - val idleExecutors = (executors.toSet -- executorWeights.keys).toSeq - if (idleExecutors.size >= preferredNumExecutors) { - // If there are more than `preferredNumExecutors` idle executors, return all of them + val idleExecutors = executors.toSet -- executorWeights.keys + if (idleExecutors.nonEmpty) { scheduledExecutors ++= idleExecutors } else { - // If there are less than `preferredNumExecutors` idle executors, return 3 best options - scheduledExecutors ++= idleExecutors - val sortedExecutors = executorWeights.toSeq.sortBy(_._2).map(_._1) - scheduledExecutors ++= (idleExecutors ++ sortedExecutors).take(preferredNumExecutors) + // There is no idle executor. So select all executors that have the minimum weight. + val sortedExecutors = executorWeights.toSeq.sortBy(_._2) + if (sortedExecutors.nonEmpty) { + val minWeight = sortedExecutors(0)._2 + scheduledExecutors ++= sortedExecutors.takeWhile(_._2 == minWeight).map(_._1) + } else { + // This should not happen since "executors" is not empty + } } scheduledExecutors.toSeq } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index e076fb5ea174..f86fd44b4871 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -244,8 +244,21 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } if (isTrackerStopping || isTrackerStopped) { - false - } else if (!scheduleReceiver(streamId).contains(hostPort)) { + return false + } + + val scheduledExecutors = receiverTrackingInfos(streamId).scheduledExecutors + val accetableExecutors = if (scheduledExecutors.nonEmpty) { + // This receiver is registering and it's scheduled by + // ReceiverSchedulingPolicy.scheduleReceivers. So use "scheduledExecutors" to check it. + scheduledExecutors.get + } else { + // This receiver is scheduled by "ReceiverSchedulingPolicy.rescheduleReceiver", so calling + // "ReceiverSchedulingPolicy.rescheduleReceiver" again to check it. 
+ scheduleReceiver(streamId) + } + + if (!accetableExecutors.contains(hostPort)) { // Refuse it since it's scheduled to a wrong executor false } else { @@ -278,7 +291,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ReceiverTrackingInfo( streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo)) } - receiverTrackingInfos -= streamId + receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" @@ -426,12 +439,25 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false startReceiver(receiver, executors) } case RestartReceiver(receiver) => - val scheduledExecutors = schedulingPolicy.rescheduleReceiver( - receiver.streamId, - receiver.preferredLocation, - receiverTrackingInfos, - getExecutors) - updateReceiverScheduledExecutors(receiver.streamId, scheduledExecutors) + // Old scheduled executors minus the ones that are not active any more + val oldScheduledExecutors = getStoredScheduledExecutors(receiver.streamId) + val scheduledExecutors = if (oldScheduledExecutors.nonEmpty) { + // Try global scheduling again + oldScheduledExecutors + } else { + val oldReceiverInfo = receiverTrackingInfos(receiver.streamId) + // Clear "scheduledExecutors" to indicate we are going to do local scheduling + val newReceiverInfo = oldReceiverInfo.copy( + state = ReceiverState.INACTIVE, scheduledExecutors = None) + receiverTrackingInfos(receiver.streamId) = newReceiverInfo + schedulingPolicy.rescheduleReceiver( + receiver.streamId, + receiver.preferredLocation, + receiverTrackingInfos, + getExecutors) + } + // Assume there is one receiver restarting at one time, so we don't need to update + // receiverTrackingInfos startReceiver(receiver, scheduledExecutors) case c: CleanupOldBlocks => receiverTrackingInfos.values.flatMap(_.endpoint).foreach(_.send(c)) @@ -457,19 +483,42 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) // Local messages case AllReceiverIds => - context.reply(receiverTrackingInfos.keys.toSeq) + context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq) case StopAllReceivers => assert(isTrackerStopping || isTrackerStopped) stopReceivers() context.reply(true) } + /** + * Return the stored scheduled executors that are still alive. 
+ */ + private def getStoredScheduledExecutors(receiverId: Int): Seq[String] = { + if (receiverTrackingInfos.contains(receiverId)) { + val scheduledExecutors = receiverTrackingInfos(receiverId).scheduledExecutors + if (scheduledExecutors.nonEmpty) { + val executors = getExecutors.toSet + // Only return the alive executors + scheduledExecutors.get.filter(executors) + } else { + Nil + } + } else { + Nil + } + } + /** * Start a receiver along with its scheduled executors */ private def startReceiver(receiver: Receiver[_], scheduledExecutors: Seq[String]): Unit = { + def shouldStartReceiver: Boolean = { + // It's okay to start when trackerState is Initialized or Started + !(isTrackerStopping || isTrackerStopped) + } + val receiverId = receiver.streamId - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) return } @@ -479,7 +528,23 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration) // Function to start the receiver on the worker node - val startReceiverFunc = new StartReceiverFunc(checkpointDirOption, serializableHadoopConf) + val startReceiverFunc: Iterator[Receiver[_]] => Unit = + (iterator: Iterator[Receiver[_]]) => { + if (!iterator.hasNext) { + throw new SparkException( + "Could not start receiver as object not found.") + } + if (TaskContext.get().attemptNumber() == 0) { + val receiver = iterator.next() + assert(iterator.hasNext == false) + val supervisor = new ReceiverSupervisorImpl( + receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption) + supervisor.start() + supervisor.awaitTermination() + } else { + // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it. + } + } // Create the RDD using the scheduledExecutors to run the receiver in a Spark job val receiverRDD: RDD[Receiver[_]] = @@ -494,14 +559,14 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false // We will keep restarting the receiver job until ReceiverTracker is stopped future.onComplete { case Success(_) => - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logInfo(s"Restarting Receiver $receiverId") self.send(RestartReceiver(receiver)) } case Failure(e) => - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logError("Receiver has been stopped. Try to restart it.", e) @@ -536,31 +601,3 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } } - -/** - * Function to start the receiver on the worker node. Use a class instead of closure to avoid - * the serialization issue. - */ -private class StartReceiverFunc( - checkpointDirOption: Option[String], - serializableHadoopConf: SerializableConfiguration) - extends (Iterator[Receiver[_]] => Unit) with Serializable { - - override def apply(iterator: Iterator[Receiver[_]]): Unit = { - if (!iterator.hasNext) { - throw new SparkException( - "Could not start receiver as object not found.") - } - if (TaskContext.get().attemptNumber() == 0) { - val receiver = iterator.next() - assert(iterator.hasNext == false) - val supervisor = new ReceiverSupervisorImpl( - receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption) - supervisor.start() - supervisor.awaitTermination() - } else { - // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it. 
- } - } - -} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala index 6ae56a68ad88..84a3ca9d74e5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala @@ -17,6 +17,8 @@ package org.apache.spark.streaming.scheduler.rate +import org.apache.spark.Logging + /** * Implements a proportional-integral-derivative (PID) controller which acts on * the speed of ingestion of elements into Spark Streaming. A PID controller works @@ -26,7 +28,7 @@ package org.apache.spark.streaming.scheduler.rate * * @see https://en.wikipedia.org/wiki/PID_controller * - * @param batchDurationMillis the batch duration, in milliseconds + * @param batchIntervalMillis the batch duration, in milliseconds * @param proportional how much the correction should depend on the current * error. This term usually provides the bulk of correction and should be positive or zero. * A value too large would make the controller overshoot the setpoint, while a small value @@ -39,13 +41,17 @@ package org.apache.spark.streaming.scheduler.rate * of future errors, based on current rate of change. This value should be positive or 0. * This term is not used very often, as it impacts stability of the system. The default * value is 0. + * @param minRate what is the minimum rate that can be estimated. + * This must be greater than zero, so that the system always receives some data for rate + * estimation to work. */ private[streaming] class PIDRateEstimator( batchIntervalMillis: Long, - proportional: Double = 1D, - integral: Double = .2D, - derivative: Double = 0D) - extends RateEstimator { + proportional: Double, + integral: Double, + derivative: Double, + minRate: Double + ) extends RateEstimator with Logging { private var firstRun: Boolean = true private var latestTime: Long = -1L @@ -64,16 +70,23 @@ private[streaming] class PIDRateEstimator( require( derivative >= 0, s"Derivative term $derivative in PIDRateEstimator should be >= 0.") + require( + minRate > 0, + s"Minimum rate in PIDRateEstimator should be > 0") + logInfo(s"Created PIDRateEstimator with proportional = $proportional, integral = $integral, " + + s"derivative = $derivative, min rate = $minRate") - def compute(time: Long, // in milliseconds + def compute( + time: Long, // in milliseconds numElements: Long, processingDelay: Long, // in milliseconds schedulingDelay: Long // in milliseconds ): Option[Double] = { - + logTrace(s"\ntime = $time, # records = $numElements, " + + s"processing time = $processingDelay, scheduling delay = $schedulingDelay") this.synchronized { - if (time > latestTime && processingDelay > 0 && batchIntervalMillis > 0) { + if (time > latestTime && numElements > 0 && processingDelay > 0) { // in seconds, should be close to batchDuration val delaySinceUpdate = (time - latestTime).toDouble / 1000 @@ -104,21 +117,30 @@ private[streaming] class PIDRateEstimator( val newRate = (latestRate - proportional * error - integral * historicalError - - derivative * dError).max(0.0) + derivative * dError).max(minRate) + logTrace(s""" + | latestRate = $latestRate, error = $error + | latestError = $latestError, historicalError = $historicalError + | delaySinceUpdate = $delaySinceUpdate, dError = $dError + """.stripMargin) + latestTime = time if (firstRun) { latestRate = processingRate latestError = 
0D firstRun = false - + logTrace("First run, rate estimation skipped") None } else { latestRate = newRate latestError = error - + logTrace(s"New rate = $newRate") Some(newRate) } - } else None + } else { + logTrace("Rate estimation skipped") + None + } } } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala index 17ccebc1ed41..d7210f64fcc3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala @@ -18,7 +18,6 @@ package org.apache.spark.streaming.scheduler.rate import org.apache.spark.SparkConf -import org.apache.spark.SparkException import org.apache.spark.streaming.Duration /** @@ -61,7 +60,8 @@ object RateEstimator { val proportional = conf.getDouble("spark.streaming.backpressure.pid.proportional", 1.0) val integral = conf.getDouble("spark.streaming.backpressure.pid.integral", 0.2) val derived = conf.getDouble("spark.streaming.backpressure.pid.derived", 0.0) - new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived) + val minRate = conf.getDouble("spark.streaming.backpressure.pid.minRate", 100) + new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived, minRate) case estimator => throw new IllegalArgumentException(s"Unkown rate estimator: $estimator") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index 0c891662c264..90d1b0fadecf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -28,7 +28,7 @@ import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} import org.apache.spark.streaming.ui.StreamingJobProgressListener.{SparkJobId, OutputOpId} import org.apache.spark.ui.jobs.UIData.JobUIData -private case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) +private[ui] case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { private val streamingListener = parent.listener diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index b77c555c68b8..78aeb004e18b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -148,6 +148,14 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) receiverInfos.size } + def numActiveReceivers: Int = synchronized { + receiverInfos.count(_._2.active) + } + + def numInactiveReceivers: Int = { + ssc.graph.getReceiverInputStreams().size - numActiveReceivers + } + def numTotalCompletedBatches: Long = synchronized { totalCompletedBatches } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index 87af902428ec..96d943e75d27 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ 
b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -303,6 +303,7 @@ private[ui] class StreamingPage(parent: StreamingTab) val numCompletedBatches = listener.retainedCompletedBatches.size val numActiveBatches = batchTimes.length - numCompletedBatches + val numReceivers = listener.numInactiveReceivers + listener.numActiveReceivers val table = // scalastyle:off
Property NameDefaultMeaning
spark.sql.hive.metastore.version0.13.11.2.1 Version of the Hive metastore. Available - options are 0.12.0 and 0.13.1. Support for more versions is coming in the future. + options are 0.12.0 through 1.2.1.
spark.sql.codegenspark.sql.tungsten.enabled true - When true, code will be dynamically generated at runtime for expression evaluation in a specific - query. For some queries with complicated expression this option can lead to significant speed-ups. + When true, use the optimized Tungsten physical execution backend which explicitly manages memory + and dynamically generates bytecode for expression evaluation.
+      <td>{executionUIData.executionId.toString}</td>
+      <td>{descriptionCell(executionUIData)}</td>
+      <td>{UIUtils.formatDate(submissionTime)}</td>
+      <td>{UIUtils.formatDuration(duration)}</td>
+      <td>{runningJobs}</td>
+      <td>{succeededJobs}</td>
+      <td>{failedJobs}</td>
<td>{summary}{details}</td>
<td>{session.userName}</td> <td>{session.ip}</td>
@@ -330,6 +331,11 @@ private[ui] class StreamingPage(parent: StreamingTab)
       }
     }
+    {
+      if (numReceivers > 0) {
+        Receivers: {listener.numActiveReceivers} / {numReceivers} active
+      }
+    }
     Avg: {eventRateForAllStreams.formattedAvg} events/sec
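The RateEstimator.create change earlier in this patch adds a spark.streaming.backpressure.pid.minRate setting (default 100) and threads it into PIDRateEstimator. The following is a minimal configuration sketch, not part of this patch; it assumes the standard spark.streaming.backpressure.enabled switch, the app name and master are placeholders, and the gain values are simply the defaults read by RateEstimator.create:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("backpressure-sketch")
  .setMaster("local[2]")
  .set("spark.streaming.backpressure.enabled", "true")
  // PID gains read by RateEstimator.create (defaults shown)
  .set("spark.streaming.backpressure.pid.proportional", "1.0")
  .set("spark.streaming.backpressure.pid.integral", "0.2")
  .set("spark.streaming.backpressure.pid.derived", "0.0")
  // New in this patch: published rates never drop below this floor
  .set("spark.streaming.backpressure.pid.minRate", "100")

val ssc = new StreamingContext(conf, Seconds(1))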
diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java index 50e8f9fc159c..175b8a496b4e 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java @@ -17,13 +17,15 @@ package org.apache.spark.streaming; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.nio.ByteBuffer; import java.util.Arrays; -import java.util.Collection; +import java.util.Iterator; +import java.util.List; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.collections.Transformer; +import com.google.common.base.Function; +import com.google.common.collect.Iterators; import org.apache.spark.SparkConf; import org.apache.spark.streaming.util.WriteAheadLog; import org.apache.spark.streaming.util.WriteAheadLogRecordHandle; @@ -32,40 +34,40 @@ import org.junit.Test; import org.junit.Assert; -class JavaWriteAheadLogSuiteHandle extends WriteAheadLogRecordHandle { - int index = -1; - public JavaWriteAheadLogSuiteHandle(int idx) { - index = idx; - } -} - public class JavaWriteAheadLogSuite extends WriteAheadLog { - class Record { + static class JavaWriteAheadLogSuiteHandle extends WriteAheadLogRecordHandle { + int index = -1; + JavaWriteAheadLogSuiteHandle(int idx) { + index = idx; + } + } + + static class Record { long time; int index; ByteBuffer buffer; - public Record(long tym, int idx, ByteBuffer buf) { + Record(long tym, int idx, ByteBuffer buf) { index = idx; time = tym; buffer = buf; } } private int index = -1; - private ArrayList records = new ArrayList(); + private final List records = new ArrayList<>(); // Methods for WriteAheadLog @Override - public WriteAheadLogRecordHandle write(java.nio.ByteBuffer record, long time) { + public WriteAheadLogRecordHandle write(ByteBuffer record, long time) { index += 1; - records.add(new org.apache.spark.streaming.JavaWriteAheadLogSuite.Record(time, index, record)); + records.add(new Record(time, index, record)); return new JavaWriteAheadLogSuiteHandle(index); } @Override - public java.nio.ByteBuffer read(WriteAheadLogRecordHandle handle) { + public ByteBuffer read(WriteAheadLogRecordHandle handle) { if (handle instanceof JavaWriteAheadLogSuiteHandle) { int reqdIndex = ((JavaWriteAheadLogSuiteHandle) handle).index; for (Record record: records) { @@ -78,14 +80,13 @@ public java.nio.ByteBuffer read(WriteAheadLogRecordHandle handle) { } @Override - public java.util.Iterator readAll() { - Collection buffers = CollectionUtils.collect(records, new Transformer() { + public Iterator readAll() { + return Iterators.transform(records.iterator(), new Function() { @Override - public Object transform(Object input) { - return ((Record) input).buffer; + public ByteBuffer apply(Record input) { + return input.buffer; } }); - return buffers.iterator(); } @Override @@ -110,20 +111,21 @@ public void testCustomWAL() { WriteAheadLog wal = WriteAheadLogUtils.createLogForDriver(conf, null, null); String data1 = "data1"; - WriteAheadLogRecordHandle handle = wal.write(ByteBuffer.wrap(data1.getBytes()), 1234); + WriteAheadLogRecordHandle handle = + wal.write(ByteBuffer.wrap(data1.getBytes(StandardCharsets.UTF_8)), 1234); Assert.assertTrue(handle instanceof JavaWriteAheadLogSuiteHandle); - Assert.assertTrue(new String(wal.read(handle).array()).equals(data1)); + Assert.assertEquals(new String(wal.read(handle).array(), 
StandardCharsets.UTF_8), data1); - wal.write(ByteBuffer.wrap("data2".getBytes()), 1235); - wal.write(ByteBuffer.wrap("data3".getBytes()), 1236); - wal.write(ByteBuffer.wrap("data4".getBytes()), 1237); + wal.write(ByteBuffer.wrap("data2".getBytes(StandardCharsets.UTF_8)), 1235); + wal.write(ByteBuffer.wrap("data3".getBytes(StandardCharsets.UTF_8)), 1236); + wal.write(ByteBuffer.wrap("data4".getBytes(StandardCharsets.UTF_8)), 1237); wal.clean(1236, false); - java.util.Iterator dataIterator = wal.readAll(); - ArrayList readData = new ArrayList(); + Iterator dataIterator = wal.readAll(); + List readData = new ArrayList<>(); while (dataIterator.hasNext()) { - readData.add(new String(dataIterator.next().array())); + readData.add(new String(dataIterator.next().array(), StandardCharsets.UTF_8)); } - Assert.assertTrue(readData.equals(Arrays.asList("data3", "data4"))); + Assert.assertEquals(readData, Arrays.asList("data3", "data4")); } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 67c2d900940a..1bba7a143edf 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming import java.io.File -import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} +import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.reflect.ClassTag import com.google.common.base.Charsets @@ -33,7 +33,7 @@ import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ import org.apache.spark.streaming.dstream.{DStream, FileInputDStream} -import org.apache.spark.streaming.scheduler.{RateLimitInputDStream, ConstantEstimator, SingletonTestRateReceiver} +import org.apache.spark.streaming.scheduler.{ConstantEstimator, RateTestInputDStream, RateTestReceiver} import org.apache.spark.util.{Clock, ManualClock, Utils} /** @@ -397,26 +397,24 @@ class CheckpointSuite extends TestSuiteBase { ssc = new StreamingContext(conf, batchDuration) ssc.checkpoint(checkpointDir) - val dstream = new RateLimitInputDStream(ssc) { + val dstream = new RateTestInputDStream(ssc) { override val rateController = - Some(new ReceiverRateController(id, new ConstantEstimator(200.0))) + Some(new ReceiverRateController(id, new ConstantEstimator(200))) } - SingletonTestRateReceiver.reset() val output = new TestOutputStreamWithPartitions(dstream.checkpoint(batchDuration * 2)) output.register() runStreams(ssc, 5, 5) - SingletonTestRateReceiver.reset() ssc = new StreamingContext(checkpointDir) ssc.start() val outputNew = advanceTimeWithRealDelay(ssc, 2) - eventually(timeout(5.seconds)) { - assert(dstream.getCurrentRateLimit === Some(200)) + eventually(timeout(10.seconds)) { + assert(RateTestReceiver.getActive().nonEmpty) + assert(RateTestReceiver.getActive().get.getDefaultBlockGeneratorRateLimit() === 200) } ssc.stop() - ssc = null } // This tests whether file input stream remembers what files were seen before diff --git a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala index 0c4c06534a69..e82c2fa4e72a 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala @@ -17,25 +17,32 @@ package org.apache.spark.streaming -import org.apache.spark.Logging 
+import java.io.File + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkFunSuite, Logging} import org.apache.spark.util.Utils /** * This testsuite tests master failures at random times while the stream is running using * the real clock. */ -class FailureSuite extends TestSuiteBase with Logging { +class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { - val directory = Utils.createTempDir() - val numBatches = 30 + private val batchDuration: Duration = Milliseconds(1000) + private val numBatches = 30 + private var directory: File = null - override def batchDuration: Duration = Milliseconds(1000) - - override def useManualClock: Boolean = false + before { + directory = Utils.createTempDir() + } - override def afterFunction() { - Utils.deleteRecursively(directory) - super.afterFunction() + after { + if (directory != null) { + Utils.deleteRecursively(directory) + } + StreamingContext.getActive().foreach { _.stop() } } test("multiple failures with map") { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index ec2852d9a020..047e38ef9099 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -76,6 +76,9 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { fail("Timeout: cannot finish all batches in 30 seconds") } + // Ensure progress listener has been notified of all events + ssc.scheduler.listenerBus.waitUntilEmpty(500) + // Verify all "InputInfo"s have been reported assert(ssc.progressListener.numTotalReceivedRecords === input.size) assert(ssc.progressListener.numTotalProcessedRecords === input.size) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala new file mode 100644 index 000000000000..6d388d9624d9 --- /dev/null +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming + +import scala.util.Random + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.rdd.BlockRDD +import org.apache.spark.storage.{StorageLevel, StreamBlockId} +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD +import org.apache.spark.streaming.receiver.{BlockManagerBasedStoreResult, Receiver, WriteAheadLogBasedStoreResult} +import org.apache.spark.streaming.scheduler.ReceivedBlockInfo +import org.apache.spark.streaming.util.{WriteAheadLogRecordHandle, WriteAheadLogUtils} +import org.apache.spark.{SparkConf, SparkEnv} + +class ReceiverInputDStreamSuite extends TestSuiteBase with BeforeAndAfterAll { + + override def afterAll(): Unit = { + StreamingContext.getActive().map { _.stop() } + } + + testWithoutWAL("createBlockRDD creates empty BlockRDD when no block info") { receiverStream => + val rdd = receiverStream.createBlockRDD(Time(0), Seq.empty) + assert(rdd.isInstanceOf[BlockRDD[_]]) + assert(!rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + assert(rdd.isEmpty()) + } + + testWithoutWAL("createBlockRDD creates correct BlockRDD with block info") { receiverStream => + val blockInfos = Seq.fill(5) { createBlockInfo(withWALInfo = false) } + val blockIds = blockInfos.map(_.blockId) + + // Verify that there are some blocks that are present, and some that are not + require(blockIds.forall(blockId => SparkEnv.get.blockManager.master.contains(blockId))) + + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + assert(!rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + } + + testWithoutWAL("createBlockRDD filters non-existent blocks before creating BlockRDD") { + receiverStream => + val presentBlockInfos = Seq.fill(2)(createBlockInfo(withWALInfo = false, createBlock = true)) + val absentBlockInfos = Seq.fill(3)(createBlockInfo(withWALInfo = false, createBlock = false)) + val blockInfos = presentBlockInfos ++ absentBlockInfos + val blockIds = blockInfos.map(_.blockId) + + // Verify that there are some blocks that are present, and some that are not + require(blockIds.exists(blockId => SparkEnv.get.blockManager.master.contains(blockId))) + require(blockIds.exists(blockId => !SparkEnv.get.blockManager.master.contains(blockId))) + + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === presentBlockInfos.map { _.blockId}) + } + + testWithWAL("createBlockRDD creates empty WALBackedBlockRDD when no block info") { + receiverStream => + val rdd = receiverStream.createBlockRDD(Time(0), Seq.empty) + assert(rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + assert(rdd.isEmpty()) + } + + testWithWAL( + "createBlockRDD creates correct WALBackedBlockRDD with all block info having WAL info") { + receiverStream => + val blockInfos = Seq.fill(5) { createBlockInfo(withWALInfo = true) } + val blockIds = blockInfos.map(_.blockId) + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[WriteAheadLogBackedBlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + assert(blockRDD.walRecordHandles.toSeq === blockInfos.map { _.walRecordHandleOption.get }) + } + + testWithWAL("createBlockRDD creates BlockRDD when some 
block info dont have WAL info") { + receiverStream => + val blockInfos1 = Seq.fill(2) { createBlockInfo(withWALInfo = true) } + val blockInfos2 = Seq.fill(3) { createBlockInfo(withWALInfo = false) } + val blockInfos = blockInfos1 ++ blockInfos2 + val blockIds = blockInfos.map(_.blockId) + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + } + + + private def testWithoutWAL(msg: String)(body: ReceiverInputDStream[_] => Unit): Unit = { + test(s"Without WAL enabled: $msg") { + runTest(enableWAL = false, body) + } + } + + private def testWithWAL(msg: String)(body: ReceiverInputDStream[_] => Unit): Unit = { + test(s"With WAL enabled: $msg") { + runTest(enableWAL = true, body) + } + } + + private def runTest(enableWAL: Boolean, body: ReceiverInputDStream[_] => Unit): Unit = { + val conf = new SparkConf() + conf.setMaster("local[4]").setAppName("ReceiverInputDStreamSuite") + conf.set(WriteAheadLogUtils.RECEIVER_WAL_ENABLE_CONF_KEY, enableWAL.toString) + require(WriteAheadLogUtils.enableReceiverLog(conf) === enableWAL) + val ssc = new StreamingContext(conf, Seconds(1)) + val receiverStream = new ReceiverInputDStream[Int](ssc) { + override def getReceiver(): Receiver[Int] = null + } + withStreamingContext(ssc) { ssc => + body(receiverStream) + } + } + + /** + * Create a block info for input to the ReceiverInputDStream.createBlockRDD + * @param withWALInfo Create block with WAL info in it + * @param createBlock Actually create the block in the BlockManager + * @return + */ + private def createBlockInfo( + withWALInfo: Boolean, + createBlock: Boolean = true): ReceivedBlockInfo = { + val blockId = new StreamBlockId(0, Random.nextLong()) + if (createBlock) { + SparkEnv.get.blockManager.putSingle(blockId, 1, StorageLevel.MEMORY_ONLY, tellMaster = true) + require(SparkEnv.get.blockManager.master.contains(blockId)) + } + val storeResult = if (withWALInfo) { + new WriteAheadLogBasedStoreResult(blockId, None, new WriteAheadLogRecordHandle { }) + } else { + new BlockManagerBasedStoreResult(blockId, None) + } + new ReceivedBlockInfo(0, None, None, storeResult) + } +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 13b4d17c8618..01279b34f73d 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -129,32 +129,6 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { } } - test("block generator") { - val blockGeneratorListener = new FakeBlockGeneratorListener - val blockIntervalMs = 200 - val conf = new SparkConf().set("spark.streaming.blockInterval", s"${blockIntervalMs}ms") - val blockGenerator = new BlockGenerator(blockGeneratorListener, 1, conf) - val expectedBlocks = 5 - val waitTime = expectedBlocks * blockIntervalMs + (blockIntervalMs / 2) - val generatedData = new ArrayBuffer[Int] - - // Generate blocks - val startTime = System.currentTimeMillis() - blockGenerator.start() - var count = 0 - while(System.currentTimeMillis - startTime < waitTime) { - blockGenerator.addData(count) - generatedData += count - count += 1 - Thread.sleep(10) - } - blockGenerator.stop() - - val recordedData = blockGeneratorListener.arrayBuffers.flatten - assert(blockGeneratorListener.arrayBuffers.size > 0) - assert(recordedData.toSet === 
generatedData.toSet) - } - ignore("block generator throttling") { val blockGeneratorListener = new FakeBlockGeneratorListener val blockIntervalMs = 100 @@ -348,6 +322,11 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { } override protected def onReceiverStart(): Boolean = true + + override def createBlockGenerator( + blockGeneratorListener: BlockGeneratorListener): BlockGenerator = { + null + } } /** diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index b7db280f6358..d26894e88fc2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -30,7 +30,7 @@ import org.scalatest.concurrent.Timeouts import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.SpanSugar._ -import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark._ import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.storage.StorageLevel @@ -726,16 +726,26 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo } test("queueStream doesn't support checkpointing") { - val checkpointDir = Utils.createTempDir() - ssc = new StreamingContext(master, appName, batchDuration) - val rdd = ssc.sparkContext.parallelize(1 to 10) - ssc.queueStream[Int](Queue(rdd)).print() - ssc.checkpoint(checkpointDir.getAbsolutePath) - val e = intercept[NotSerializableException] { - ssc.start() + val checkpointDirectory = Utils.createTempDir().getAbsolutePath() + def creatingFunction(): StreamingContext = { + val _ssc = new StreamingContext(conf, batchDuration) + val rdd = _ssc.sparkContext.parallelize(1 to 10) + _ssc.checkpoint(checkpointDirectory) + _ssc.queueStream[Int](Queue(rdd)).register() + _ssc + } + ssc = StreamingContext.getOrCreate(checkpointDirectory, creatingFunction _) + ssc.start() + eventually(timeout(10000 millis)) { + assert(Checkpoint.getCheckpointFiles(checkpointDirectory).size > 1) + } + ssc.stop() + val e = intercept[SparkException] { + ssc = StreamingContext.getOrCreate(checkpointDirectory, creatingFunction _) } // StreamingContext.validate changes the message, so use "contains" here - assert(e.getMessage.contains("queueStream doesn't support checkpointing")) + assert(e.getCause.getMessage.contains("queueStream doesn't support checkpointing. " + + "Please don't use queueStream when checkpointing is enabled.")) } def addInputStream(s: StreamingContext): DStream[Int] = { @@ -789,7 +799,8 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging } def onStop() { - // no clean to be done, the receiving thread should stop on it own + // no clean to be done, the receiving thread should stop on it own, so just wait for it. 
+ receivingThreadOption.foreach(_.join()) } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala index a08578680cff..068a6cb0e8fa 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala @@ -100,8 +100,8 @@ class UISeleniumSuite // Check stat table val statTableHeaders = findAll(cssSelector("#stat-table th")).map(_.text).toSeq statTableHeaders.exists( - _.matches("Timelines \\(Last \\d+ batches, \\d+ active, \\d+ completed\\)")) should be - (true) + _.matches("Timelines \\(Last \\d+ batches, \\d+ active, \\d+ completed\\)") + ) should be (true) statTableHeaders should contain ("Histograms") val statTableCells = findAll(cssSelector("#stat-table td")).map(_.text).toSeq diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala new file mode 100644 index 000000000000..a38cc603f219 --- /dev/null +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.receiver + +import scala.collection.mutable + +import org.scalatest.BeforeAndAfter +import org.scalatest.Matchers._ +import org.scalatest.concurrent.Timeouts._ +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.storage.StreamBlockId +import org.apache.spark.util.ManualClock +import org.apache.spark.{SparkException, SparkConf, SparkFunSuite} + +class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { + + private val blockIntervalMs = 10 + private val conf = new SparkConf().set("spark.streaming.blockInterval", s"${blockIntervalMs}ms") + @volatile private var blockGenerator: BlockGenerator = null + + after { + if (blockGenerator != null) { + blockGenerator.stop() + } + } + + test("block generation and data callbacks") { + val listener = new TestBlockGeneratorListener + val clock = new ManualClock() + + require(blockIntervalMs > 5) + require(listener.onAddDataCalled === false) + require(listener.onGenerateBlockCalled === false) + require(listener.onPushBlockCalled === false) + + // Verify that creating the generator does not start it + blockGenerator = new BlockGenerator(listener, 0, conf, clock) + assert(blockGenerator.isActive() === false, "block generator active before start()") + assert(blockGenerator.isStopped() === false, "block generator stopped before start()") + assert(listener.onAddDataCalled === false) + assert(listener.onGenerateBlockCalled === false) + assert(listener.onPushBlockCalled === false) + + // Verify start marks the generator active, but does not call the callbacks + blockGenerator.start() + assert(blockGenerator.isActive() === true, "block generator active after start()") + assert(blockGenerator.isStopped() === false, "block generator stopped after start()") + withClue("callbacks called before adding data") { + assert(listener.onAddDataCalled === false) + assert(listener.onGenerateBlockCalled === false) + assert(listener.onPushBlockCalled === false) + } + + // Verify whether addData() adds data that is present in generated blocks + val data1 = 1 to 10 + data1.foreach { blockGenerator.addData _ } + withClue("callbacks called on adding data without metadata and without block generation") { + assert(listener.onAddDataCalled === false) // should be called only with addDataWithCallback() + assert(listener.onGenerateBlockCalled === false) + assert(listener.onPushBlockCalled === false) + } + clock.advance(blockIntervalMs) // advance clock to generate blocks + withClue("blocks not generated or pushed") { + eventually(timeout(1 second)) { + assert(listener.onGenerateBlockCalled === true) + assert(listener.onPushBlockCalled === true) + } + } + listener.pushedData should contain theSameElementsInOrderAs (data1) + assert(listener.onAddDataCalled === false) // should be called only with addDataWithCallback() + + // Verify addDataWithCallback() add data+metadata and and callbacks are called correctly + val data2 = 11 to 20 + val metadata2 = data2.map { _.toString } + data2.zip(metadata2).foreach { case (d, m) => blockGenerator.addDataWithCallback(d, m) } + assert(listener.onAddDataCalled === true) + listener.addedData should contain theSameElementsInOrderAs (data2) + listener.addedMetadata should contain theSameElementsInOrderAs (metadata2) + clock.advance(blockIntervalMs) // advance clock to generate blocks + eventually(timeout(1 second)) { + listener.pushedData should contain theSameElementsInOrderAs (data1 ++ data2) + } + + // Verify addMultipleDataWithCallback() 
add data+metadata and and callbacks are called correctly + val data3 = 21 to 30 + val metadata3 = "metadata" + blockGenerator.addMultipleDataWithCallback(data3.iterator, metadata3) + listener.addedMetadata should contain theSameElementsInOrderAs (metadata2 :+ metadata3) + clock.advance(blockIntervalMs) // advance clock to generate blocks + eventually(timeout(1 second)) { + listener.pushedData should contain theSameElementsInOrderAs (data1 ++ data2 ++ data3) + } + + // Stop the block generator by starting the stop on a different thread and + // then advancing the manual clock for the stopping to proceed. + val thread = stopBlockGenerator(blockGenerator) + eventually(timeout(1 second), interval(10 milliseconds)) { + clock.advance(blockIntervalMs) + assert(blockGenerator.isStopped() === true) + } + thread.join() + + // Verify that the generator cannot be used any more + intercept[SparkException] { + blockGenerator.addData(1) + } + intercept[SparkException] { + blockGenerator.addDataWithCallback(1, 1) + } + intercept[SparkException] { + blockGenerator.addMultipleDataWithCallback(Iterator(1), 1) + } + intercept[SparkException] { + blockGenerator.start() + } + blockGenerator.stop() // Calling stop again should be fine + } + + test("stop ensures correct shutdown") { + val listener = new TestBlockGeneratorListener + val clock = new ManualClock() + blockGenerator = new BlockGenerator(listener, 0, conf, clock) + require(listener.onGenerateBlockCalled === false) + blockGenerator.start() + assert(blockGenerator.isActive() === true, "block generator") + assert(blockGenerator.isStopped() === false) + + val data = 1 to 1000 + data.foreach { blockGenerator.addData _ } + + // Verify that stop() shutdowns everything in the right order + // - First, stop receiving new data + // - Second, wait for final block with all buffered data to be generated + // - Finally, wait for all blocks to be pushed + clock.advance(1) // to make sure that the timer for another interval to complete + val thread = stopBlockGenerator(blockGenerator) + eventually(timeout(1 second), interval(10 milliseconds)) { + assert(blockGenerator.isActive() === false) + } + assert(blockGenerator.isStopped() === false) + + // Verify that data cannot be added + intercept[SparkException] { + blockGenerator.addData(1) + } + intercept[SparkException] { + blockGenerator.addDataWithCallback(1, null) + } + intercept[SparkException] { + blockGenerator.addMultipleDataWithCallback(Iterator(1), null) + } + + // Verify that stop() stays blocked until another block containing all the data is generated + // This intercept always succeeds, as the body either will either throw a timeout exception + // (expected as stop() should never complete) or a SparkException (unexpected as stop() + // completed and thread terminated). 
+ val exception = intercept[Exception] { + failAfter(200 milliseconds) { + thread.join() + throw new SparkException( + "BlockGenerator.stop() completed before generating timer was stopped") + } + } + exception should not be a [SparkException] + + + // Verify that the final data is present in the final generated block and + // pushed before complete stop + assert(blockGenerator.isStopped() === false) // generator has not stopped yet + clock.advance(blockIntervalMs) // force block generation + failAfter(1 second) { + thread.join() + } + assert(blockGenerator.isStopped() === true) // generator has finally been completely stopped + assert(listener.pushedData === data, "All data not pushed by stop()") + } + + test("block push errors are reported") { + val listener = new TestBlockGeneratorListener { + @volatile var errorReported = false + override def onPushBlock( + blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = { + throw new SparkException("test") + } + override def onError(message: String, throwable: Throwable): Unit = { + errorReported = true + } + } + blockGenerator = new BlockGenerator(listener, 0, conf) + blockGenerator.start() + assert(listener.errorReported === false) + blockGenerator.addData(1) + eventually(timeout(1 second), interval(10 milliseconds)) { + assert(listener.errorReported === true) + } + blockGenerator.stop() + } + + /** + * Helper method to stop the block generator with manual clock in a different thread, + * so that the main thread can advance the clock that allows the stopping to proceed. + */ + private def stopBlockGenerator(blockGenerator: BlockGenerator): Thread = { + val thread = new Thread() { + override def run(): Unit = { + blockGenerator.stop() + } + } + thread.start() + thread + } + + /** A listener for BlockGenerator that records the data in the callbacks */ + private class TestBlockGeneratorListener extends BlockGeneratorListener { + val pushedData = new mutable.ArrayBuffer[Any] with mutable.SynchronizedBuffer[Any] + val addedData = new mutable.ArrayBuffer[Any] with mutable.SynchronizedBuffer[Any] + val addedMetadata = new mutable.ArrayBuffer[Any] with mutable.SynchronizedBuffer[Any] + @volatile var onGenerateBlockCalled = false + @volatile var onAddDataCalled = false + @volatile var onPushBlockCalled = false + + override def onPushBlock(blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = { + pushedData ++= arrayBuffer + onPushBlockCalled = true + } + override def onError(message: String, throwable: Throwable): Unit = {} + override def onGenerateBlock(blockId: StreamBlockId): Unit = { + onGenerateBlockCalled = true + } + override def onAddData(data: Any, metadata: Any): Unit = { + addedData += data + addedMetadata += metadata + onAddDataCalled = true + } + } +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala index 921da773f6c1..1eb52b7029a2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala @@ -18,10 +18,7 @@ package org.apache.spark.streaming.scheduler import scala.collection.mutable -import scala.reflect.ClassTag -import scala.util.control.NonFatal -import org.scalatest.Matchers._ import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ @@ -32,72 +29,63 @@ class RateControllerSuite extends TestSuiteBase { 
override def useManualClock: Boolean = false - test("rate controller publishes updates") { + override def batchDuration: Duration = Milliseconds(50) + + test("RateController - rate controller publishes updates after batches complete") { val ssc = new StreamingContext(conf, batchDuration) withStreamingContext(ssc) { ssc => - val dstream = new RateLimitInputDStream(ssc) + val dstream = new RateTestInputDStream(ssc) dstream.register() ssc.start() eventually(timeout(10.seconds)) { - assert(dstream.publishCalls > 0) + assert(dstream.publishedRates > 0) } } } - test("publish rates reach receivers") { + test("ReceiverRateController - published rates reach receivers") { val ssc = new StreamingContext(conf, batchDuration) withStreamingContext(ssc) { ssc => - val dstream = new RateLimitInputDStream(ssc) { + val estimator = new ConstantEstimator(100) + val dstream = new RateTestInputDStream(ssc) { override val rateController = - Some(new ReceiverRateController(id, new ConstantEstimator(200.0))) + Some(new ReceiverRateController(id, estimator)) } dstream.register() - SingletonTestRateReceiver.reset() ssc.start() - eventually(timeout(10.seconds)) { - assert(dstream.getCurrentRateLimit === Some(200)) + // Wait for receiver to start + eventually(timeout(5.seconds)) { + RateTestReceiver.getActive().nonEmpty } - } - } - test("multiple publish rates reach receivers") { - val ssc = new StreamingContext(conf, batchDuration) - withStreamingContext(ssc) { ssc => - val rates = Seq(100L, 200L, 300L) - - val dstream = new RateLimitInputDStream(ssc) { - override val rateController = - Some(new ReceiverRateController(id, new ConstantEstimator(rates.map(_.toDouble): _*))) + // Update rate in the estimator and verify whether the rate was published to the receiver + def updateRateAndVerify(rate: Long): Unit = { + estimator.updateRate(rate) + eventually(timeout(5.seconds)) { + assert(RateTestReceiver.getActive().get.getDefaultBlockGeneratorRateLimit() === rate) + } } - SingletonTestRateReceiver.reset() - dstream.register() - - val observedRates = mutable.HashSet.empty[Long] - ssc.start() - eventually(timeout(20.seconds)) { - dstream.getCurrentRateLimit.foreach(observedRates += _) - // Long.MaxValue (essentially, no rate limit) is the initial rate limit for any Receiver - observedRates should contain theSameElementsAs (rates :+ Long.MaxValue) + // Verify multiple rate update + Seq(100, 200, 300).foreach { rate => + updateRateAndVerify(rate) } } } } -private[streaming] class ConstantEstimator(rates: Double*) extends RateEstimator { - private var idx: Int = 0 +private[streaming] class ConstantEstimator(@volatile private var rate: Long) + extends RateEstimator { - private def nextRate(): Double = { - val rate = rates(idx) - idx = (idx + 1) % rates.size - rate + def updateRate(newRate: Long): Unit = { + rate = newRate } def compute( time: Long, elements: Long, processingDelay: Long, - schedulingDelay: Long): Option[Double] = Some(nextRate()) + schedulingDelay: Long): Option[Double] = Some(rate) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala index 0418d776ecc9..b2a51d72bac2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala @@ -39,7 +39,7 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { 
assert(scheduledExecutors.toSet === Set("host1", "host2")) } - test("rescheduleReceiver: return all idle executors if more than 3 idle executors") { + test("rescheduleReceiver: return all idle executors if there are any idle executors") { val executors = Seq("host1", "host2", "host3", "host4", "host5") // host3 is idle val receiverTrackingInfoMap = Map( @@ -49,16 +49,16 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { assert(scheduledExecutors.toSet === Set("host2", "host3", "host4", "host5")) } - test("rescheduleReceiver: return 3 best options if less than 3 idle executors") { + test("rescheduleReceiver: return all executors that have minimum weight if no idle executors") { val executors = Seq("host1", "host2", "host3", "host4", "host5") - // Weights: host1 = 1.5, host2 = 0.5, host3 = 1.0 - // host4 and host5 are idle + // Weights: host1 = 1.5, host2 = 0.5, host3 = 1.0, host4 = 0.5, host5 = 0.5 val receiverTrackingInfoMap = Map( 0 -> ReceiverTrackingInfo(0, ReceiverState.ACTIVE, None, Some("host1")), 1 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host2", "host3")), None), - 2 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host1", "host3")), None)) + 2 -> ReceiverTrackingInfo(2, ReceiverState.SCHEDULED, Some(Seq("host1", "host3")), None), + 3 -> ReceiverTrackingInfo(4, ReceiverState.SCHEDULED, Some(Seq("host4", "host5")), None)) val scheduledExecutors = receiverSchedulingPolicy.rescheduleReceiver( - 3, None, receiverTrackingInfoMap, executors) + 4, None, receiverTrackingInfoMap, executors) assert(scheduledExecutors.toSet === Set("host2", "host4", "host5")) } @@ -127,4 +127,5 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { assert(executors.isEmpty) } } + } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index afad5f16dbc7..45138b748eca 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -17,119 +17,169 @@ package org.apache.spark.streaming.scheduler +import scala.collection.mutable.ArrayBuffer + import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ -import org.apache.spark.SparkConf +import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming._ -import org.apache.spark.streaming.receiver._ import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.receiver._ /** Testsuite for receiver scheduling */ class ReceiverTrackerSuite extends TestSuiteBase { - val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test") - - test("Receiver tracker - propagates rate limit") { - withStreamingContext(new StreamingContext(sparkConf, Milliseconds(100))) { ssc => - object ReceiverStartedWaiter extends StreamingListener { - @volatile - var started = false - override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { - started = true - } - } - - ssc.addStreamingListener(ReceiverStartedWaiter) + test("send rate update to receivers") { + withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => ssc.scheduler.listenerBus.start(ssc.sc) - SingletonTestRateReceiver.reset() val newRateLimit = 100L - val inputDStream = new RateLimitInputDStream(ssc) + val 
inputDStream = new RateTestInputDStream(ssc) val tracker = new ReceiverTracker(ssc) tracker.start() try { // we wait until the Receiver has registered with the tracker, // otherwise our rate update is lost eventually(timeout(5 seconds)) { - assert(ReceiverStartedWaiter.started) + assert(RateTestReceiver.getActive().nonEmpty) } + + + // Verify that the rate of the block generator in the receiver get updated + val activeReceiver = RateTestReceiver.getActive().get tracker.sendRateUpdate(inputDStream.id, newRateLimit) - // this is an async message, we need to wait a bit for it to be processed - eventually(timeout(3 seconds)) { - assert(inputDStream.getCurrentRateLimit.get === newRateLimit) + eventually(timeout(5 seconds)) { + assert(activeReceiver.getDefaultBlockGeneratorRateLimit() === newRateLimit, + "default block generator did not receive rate update") + assert(activeReceiver.getCustomBlockGeneratorRateLimit() === newRateLimit, + "other block generator did not receive rate update") } } finally { tracker.stop(false) } } } + + test("should restart receiver after stopping it") { + withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => + @volatile var startTimes = 0 + ssc.addStreamingListener(new StreamingListener { + override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { + startTimes += 1 + } + }) + val input = ssc.receiverStream(new StoppableReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + StoppableReceiver.shouldStop = true + eventually(timeout(10 seconds), interval(10 millis)) { + // The receiver is stopped once, so if it's restarted, it should be started twice. + assert(startTimes === 2) + } + } + } } -/** - * An input DStream with a hard-coded receiver that gives access to internals for testing. - * - * @note Make sure to call {{{SingletonDummyReceiver.reset()}}} before using this in a test, - * or otherwise you may get {{{NotSerializableException}}} when trying to serialize - * the receiver. - * @see [[[SingletonDummyReceiver]]]. 
- */ -private[streaming] class RateLimitInputDStream(@transient ssc_ : StreamingContext) +/** An input DStream with for testing rate controlling */ +private[streaming] class RateTestInputDStream(@transient ssc_ : StreamingContext) extends ReceiverInputDStream[Int](ssc_) { - override def getReceiver(): RateTestReceiver = SingletonTestRateReceiver - - def getCurrentRateLimit: Option[Long] = { - invokeExecutorMethod.getCurrentRateLimit - } + override def getReceiver(): Receiver[Int] = new RateTestReceiver(id) @volatile - var publishCalls = 0 + var publishedRates = 0 override val rateController: Option[RateController] = { - Some(new RateController(id, new ConstantEstimator(100.0)) { + Some(new RateController(id, new ConstantEstimator(100)) { override def publish(rate: Long): Unit = { - publishCalls += 1 + publishedRates += 1 } }) } +} + +/** A receiver implementation for testing rate controlling */ +private[streaming] class RateTestReceiver(receiverId: Int, host: Option[String] = None) + extends Receiver[Int](StorageLevel.MEMORY_ONLY) { + + private lazy val customBlockGenerator = supervisor.createBlockGenerator( + new BlockGeneratorListener { + override def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {} + override def onError(message: String, throwable: Throwable): Unit = {} + override def onGenerateBlock(blockId: StreamBlockId): Unit = {} + override def onAddData(data: Any, metadata: Any): Unit = {} + } + ) + + setReceiverId(receiverId) + + override def onStart(): Unit = { + customBlockGenerator + RateTestReceiver.registerReceiver(this) + } - private def invokeExecutorMethod: ReceiverSupervisor = { - val c = classOf[Receiver[_]] - val ex = c.getDeclaredMethod("executor") - ex.setAccessible(true) - ex.invoke(SingletonTestRateReceiver).asInstanceOf[ReceiverSupervisor] + override def onStop(): Unit = { + RateTestReceiver.deregisterReceiver() + } + + override def preferredLocation: Option[String] = host + + def getDefaultBlockGeneratorRateLimit(): Long = { + supervisor.getCurrentRateLimit + } + + def getCustomBlockGeneratorRateLimit(): Long = { + customBlockGenerator.getCurrentLimit } } /** - * A Receiver as an object so we can read its rate limit. Make sure to call `reset()` when - * reusing this receiver, otherwise a non-null `executor_` field will prevent it from being - * serialized when receivers are installed on executors. - * - * @note It's necessary to be a top-level object, or else serialization would create another - * one on the executor side and we won't be able to read its rate limit. + * A helper object to RateTestReceiver that give access to the currently active RateTestReceiver + * instance. */ -private[streaming] object SingletonTestRateReceiver extends RateTestReceiver(0) { +private[streaming] object RateTestReceiver { + @volatile private var activeReceiver: RateTestReceiver = null - /** Reset the object to be usable in another test. 
*/ - def reset(): Unit = { - executor_ = null + def registerReceiver(receiver: RateTestReceiver): Unit = { + activeReceiver = receiver } + + def deregisterReceiver(): Unit = { + activeReceiver = null + } + + def getActive(): Option[RateTestReceiver] = Option(activeReceiver) } /** - * Dummy receiver implementation + * A custom receiver that could be stopped via StoppableReceiver.shouldStop */ -private[streaming] class RateTestReceiver(receiverId: Int, host: Option[String] = None) - extends Receiver[Int](StorageLevel.MEMORY_ONLY) { +class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { - setReceiverId(receiverId) + var receivingThreadOption: Option[Thread] = None - override def onStart(): Unit = {} + def onStart() { + val thread = new Thread() { + override def run() { + while (!StoppableReceiver.shouldStop) { + Thread.sleep(10) + } + StoppableReceiver.this.stop("stop") + } + } + thread.start() + } - override def onStop(): Unit = {} + def onStop() { + StoppableReceiver.shouldStop = true + receivingThreadOption.foreach(_.join()) + // Reset it so as to restart it + StoppableReceiver.shouldStop = false + } +} - override def preferredLocation: Option[String] = host +object StoppableReceiver { + @volatile var shouldStop = false } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala index 97c32d8f2d59..a1af95be81c8 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala @@ -36,72 +36,89 @@ class PIDRateEstimatorSuite extends SparkFunSuite with Matchers { test("estimator checks ranges") { intercept[IllegalArgumentException] { - new PIDRateEstimator(0, 1, 2, 3) + new PIDRateEstimator(batchIntervalMillis = 0, 1, 2, 3, 10) } intercept[IllegalArgumentException] { - new PIDRateEstimator(100, -1, 2, 3) + new PIDRateEstimator(100, proportional = -1, 2, 3, 10) } intercept[IllegalArgumentException] { - new PIDRateEstimator(100, 0, -1, 3) + new PIDRateEstimator(100, 0, integral = -1, 3, 10) } intercept[IllegalArgumentException] { - new PIDRateEstimator(100, 0, 0, -1) + new PIDRateEstimator(100, 0, 0, derivative = -1, 10) + } + intercept[IllegalArgumentException] { + new PIDRateEstimator(100, 0, 0, 0, minRate = 0) + } + intercept[IllegalArgumentException] { + new PIDRateEstimator(100, 0, 0, 0, minRate = -10) } } - private def createDefaultEstimator: PIDRateEstimator = { - new PIDRateEstimator(20, 1D, 0D, 0D) - } - - test("first bound is None") { - val p = createDefaultEstimator + test("first estimate is None") { + val p = createDefaultEstimator() p.compute(0, 10, 10, 0) should equal(None) } - test("second bound is rate") { - val p = createDefaultEstimator + test("second estimate is not None") { + val p = createDefaultEstimator() p.compute(0, 10, 10, 0) // 1000 elements / s p.compute(10, 10, 10, 0) should equal(Some(1000)) } - test("works even with no time between updates") { - val p = createDefaultEstimator + test("no estimate when no time difference between successive calls") { + val p = createDefaultEstimator() + p.compute(0, 10, 10, 0) + p.compute(time = 10, 10, 10, 0) shouldNot equal(None) + p.compute(time = 10, 10, 10, 0) should equal(None) + } + + test("no estimate when no records in previous batch") { + val p = createDefaultEstimator() p.compute(0, 10, 10, 0) - p.compute(10, 10, 10, 
0) - p.compute(10, 10, 10, 0) should equal(None) + p.compute(10, numElements = 0, 10, 0) should equal(None) + p.compute(20, numElements = -10, 10, 0) should equal(None) } - test("bound is never negative") { - val p = new PIDRateEstimator(20, 1D, 1D, 0D) + test("no estimate when there is no processing delay") { + val p = createDefaultEstimator() + p.compute(0, 10, 10, 0) + p.compute(10, 10, processingDelay = 0, 0) should equal(None) + p.compute(20, 10, processingDelay = -10, 0) should equal(None) + } + + test("estimate is never less than min rate") { + val minRate = 5D + val p = new PIDRateEstimator(20, 1D, 1D, 0D, minRate) // prepare a series of batch updates, one every 20ms, 0 processed elements, 2ms of processing // this might point the estimator to try and decrease the bound, but we test it never - // goes below zero, which would be nonsensical. + // goes below the min rate, which would be nonsensical. val times = List.tabulate(50)(x => x * 20) // every 20ms - val elements = List.fill(50)(0) // no processing + val elements = List.fill(50)(1) // no processing val proc = List.fill(50)(20) // 20ms of processing val sched = List.fill(50)(100) // strictly positive accumulation val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i)) res.head should equal(None) - res.tail should equal(List.fill(49)(Some(0D))) + res.tail should equal(List.fill(49)(Some(minRate))) } test("with no accumulated or positive error, |I| > 0, follow the processing speed") { - val p = new PIDRateEstimator(20, 1D, 1D, 0D) + val p = new PIDRateEstimator(20, 1D, 1D, 0D, 10) // prepare a series of batch updates, one every 20ms with an increasing number of processed // elements in each batch, but constant processing time, and no accumulated error. Even though // the integral part is non-zero, the estimated rate should follow only the proportional term val times = List.tabulate(50)(x => x * 20) // every 20ms - val elements = List.tabulate(50)(x => x * 20) // increasing + val elements = List.tabulate(50)(x => (x + 1) * 20) // increasing val proc = List.fill(50)(20) // 20ms of processing val sched = List.fill(50)(0) val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i)) res.head should equal(None) - res.tail should equal(List.tabulate(50)(x => Some(x * 1000D)).tail) + res.tail should equal(List.tabulate(50)(x => Some((x + 1) * 1000D)).tail) } test("with no accumulated but some positive error, |I| > 0, follow the processing speed") { - val p = new PIDRateEstimator(20, 1D, 1D, 0D) + val p = new PIDRateEstimator(20, 1D, 1D, 0D, 10) // prepare a series of batch updates, one every 20ms with an decreasing number of processed // elements in each batch, but constant processing time, and no accumulated error. 
Even though // the integral part is non-zero, the estimated rate should follow only the proportional term, @@ -116,13 +133,14 @@ class PIDRateEstimatorSuite extends SparkFunSuite with Matchers { } test("with some accumulated and some positive error, |I| > 0, stay below the processing speed") { - val p = new PIDRateEstimator(20, 1D, .01D, 0D) + val minRate = 10D + val p = new PIDRateEstimator(20, 1D, .01D, 0D, minRate) val times = List.tabulate(50)(x => x * 20) // every 20ms val rng = new Random() - val elements = List.tabulate(50)(x => rng.nextInt(1000)) + val elements = List.tabulate(50)(x => rng.nextInt(1000) + 1000) val procDelayMs = 20 val proc = List.fill(50)(procDelayMs) // 20ms of processing - val sched = List.tabulate(50)(x => rng.nextInt(19)) // random wait + val sched = List.tabulate(50)(x => rng.nextInt(19) + 1) // random wait val speeds = elements map ((x) => x.toDouble / procDelayMs * 1000) val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i)) @@ -131,7 +149,12 @@ class PIDRateEstimatorSuite extends SparkFunSuite with Matchers { res(n) should not be None if (res(n).get > 0 && sched(n) > 0) { res(n).get should be < speeds(n) + res(n).get should be >= minRate } } } + + private def createDefaultEstimator(): PIDRateEstimator = { + new PIDRateEstimator(20, 1D, 0D, 0D, 10) + } } diff --git a/tools/pom.xml b/tools/pom.xml index 298ee2348b58..2b561245437a 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 89475ee3cf5a..8c04d7f97cb5 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java b/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java similarity index 52% rename from unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java rename to unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 192c6714b240..1c16da982923 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/PlatformDependent.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -21,91 +21,107 @@ import sun.misc.Unsafe; -public final class PlatformDependent { +public final class Platform { - /** - * Facade in front of {@link sun.misc.Unsafe}, used to avoid directly exposing Unsafe outside of - * this package. This also lets us avoid accidental use of deprecated methods. 
- */ - public static final class UNSAFE { + private static final Unsafe _UNSAFE; - private UNSAFE() { } + public static final int BYTE_ARRAY_OFFSET; - public static int getInt(Object object, long offset) { - return _UNSAFE.getInt(object, offset); - } + public static final int INT_ARRAY_OFFSET; - public static void putInt(Object object, long offset, int value) { - _UNSAFE.putInt(object, offset, value); - } + public static final int LONG_ARRAY_OFFSET; - public static boolean getBoolean(Object object, long offset) { - return _UNSAFE.getBoolean(object, offset); - } + public static final int DOUBLE_ARRAY_OFFSET; - public static void putBoolean(Object object, long offset, boolean value) { - _UNSAFE.putBoolean(object, offset, value); - } + public static int getInt(Object object, long offset) { + return _UNSAFE.getInt(object, offset); + } - public static byte getByte(Object object, long offset) { - return _UNSAFE.getByte(object, offset); - } + public static void putInt(Object object, long offset, int value) { + _UNSAFE.putInt(object, offset, value); + } - public static void putByte(Object object, long offset, byte value) { - _UNSAFE.putByte(object, offset, value); - } + public static boolean getBoolean(Object object, long offset) { + return _UNSAFE.getBoolean(object, offset); + } - public static short getShort(Object object, long offset) { - return _UNSAFE.getShort(object, offset); - } + public static void putBoolean(Object object, long offset, boolean value) { + _UNSAFE.putBoolean(object, offset, value); + } - public static void putShort(Object object, long offset, short value) { - _UNSAFE.putShort(object, offset, value); - } + public static byte getByte(Object object, long offset) { + return _UNSAFE.getByte(object, offset); + } - public static long getLong(Object object, long offset) { - return _UNSAFE.getLong(object, offset); - } + public static void putByte(Object object, long offset, byte value) { + _UNSAFE.putByte(object, offset, value); + } - public static void putLong(Object object, long offset, long value) { - _UNSAFE.putLong(object, offset, value); - } + public static short getShort(Object object, long offset) { + return _UNSAFE.getShort(object, offset); + } - public static float getFloat(Object object, long offset) { - return _UNSAFE.getFloat(object, offset); - } + public static void putShort(Object object, long offset, short value) { + _UNSAFE.putShort(object, offset, value); + } - public static void putFloat(Object object, long offset, float value) { - _UNSAFE.putFloat(object, offset, value); - } + public static long getLong(Object object, long offset) { + return _UNSAFE.getLong(object, offset); + } - public static double getDouble(Object object, long offset) { - return _UNSAFE.getDouble(object, offset); - } + public static void putLong(Object object, long offset, long value) { + _UNSAFE.putLong(object, offset, value); + } - public static void putDouble(Object object, long offset, double value) { - _UNSAFE.putDouble(object, offset, value); - } + public static float getFloat(Object object, long offset) { + return _UNSAFE.getFloat(object, offset); + } - public static long allocateMemory(long size) { - return _UNSAFE.allocateMemory(size); - } + public static void putFloat(Object object, long offset, float value) { + _UNSAFE.putFloat(object, offset, value); + } - public static void freeMemory(long address) { - _UNSAFE.freeMemory(address); - } + public static double getDouble(Object object, long offset) { + return _UNSAFE.getDouble(object, offset); + } + public static void putDouble(Object 
object, long offset, double value) { + _UNSAFE.putDouble(object, offset, value); } - private static final Unsafe _UNSAFE; + public static Object getObjectVolatile(Object object, long offset) { + return _UNSAFE.getObjectVolatile(object, offset); + } - public static final int BYTE_ARRAY_OFFSET; + public static void putObjectVolatile(Object object, long offset, Object value) { + _UNSAFE.putObjectVolatile(object, offset, value); + } - public static final int INT_ARRAY_OFFSET; + public static long allocateMemory(long size) { + return _UNSAFE.allocateMemory(size); + } - public static final int LONG_ARRAY_OFFSET; + public static void freeMemory(long address) { + _UNSAFE.freeMemory(address); + } - public static final int DOUBLE_ARRAY_OFFSET; + public static void copyMemory( + Object src, long srcOffset, Object dst, long dstOffset, long length) { + while (length > 0) { + long size = Math.min(length, UNSAFE_COPY_THRESHOLD); + _UNSAFE.copyMemory(src, srcOffset, dst, dstOffset, size); + length -= size; + srcOffset += size; + dstOffset += size; + } + } + + /** + * Raises an exception bypassing compiler checks for checked exceptions. + */ + public static void throwException(Throwable t) { + _UNSAFE.throwException(t); + } /** * Limits the number of bytes to copy per {@link Unsafe#copyMemory(long, long, long)} to @@ -136,26 +152,4 @@ public static void freeMemory(long address) { DOUBLE_ARRAY_OFFSET = 0; } } - - static public void copyMemory( - Object src, - long srcOffset, - Object dst, - long dstOffset, - long length) { - while (length > 0) { - long size = Math.min(length, UNSAFE_COPY_THRESHOLD); - _UNSAFE.copyMemory(src, srcOffset, dst, dstOffset, size); - length -= size; - srcOffset += size; - dstOffset += size; - } - } - - /** - * Raises an exception bypassing compiler checks for checked exceptions. - */ - public static void throwException(Throwable t) { - _UNSAFE.throwException(t); - } } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java index cf693d01a4f5..cf42877bf9fd 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.array; -import static org.apache.spark.unsafe.PlatformDependent.*; +import org.apache.spark.unsafe.Platform; public class ByteArrayMethods { @@ -25,6 +25,12 @@ private ByteArrayMethods() { // Private constructor, since this class only contains static methods. } + /** Returns the next number greater or equal num that is power of 2. */ + public static long nextPowerOf2(long num) { + final long highBit = Long.highestOneBit(num); + return (highBit == num) ? 
num : highBit << 1; + } + public static int roundNumberOfBytesToNearestWord(int numBytes) { int remainder = numBytes & 0x07; // This is equivalent to `numBytes % 8` if (remainder == 0) { @@ -39,20 +45,18 @@ public static int roundNumberOfBytesToNearestWord(int numBytes) { * @return true if the arrays are equal, false otherwise */ public static boolean arrayEquals( - Object leftBase, - long leftOffset, - Object rightBase, - long rightOffset, - final long length) { + Object leftBase, long leftOffset, Object rightBase, long rightOffset, final long length) { int i = 0; while (i <= length - 8) { - if (UNSAFE.getLong(leftBase, leftOffset + i) != UNSAFE.getLong(rightBase, rightOffset + i)) { + if (Platform.getLong(leftBase, leftOffset + i) != + Platform.getLong(rightBase, rightOffset + i)) { return false; } i += 8; } while (i < length) { - if (UNSAFE.getByte(leftBase, leftOffset + i) != UNSAFE.getByte(rightBase, rightOffset + i)) { + if (Platform.getByte(leftBase, leftOffset + i) != + Platform.getByte(rightBase, rightOffset + i)) { return false; } i += 1; diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java b/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java index 18d1f0d2d7eb..74105050e419 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.array; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.MemoryBlock; /** @@ -64,7 +64,7 @@ public long size() { public void set(int index, long value) { assert index >= 0 : "index (" + index + ") should >= 0"; assert index < length : "index (" + index + ") should < length (" + length + ")"; - PlatformDependent.UNSAFE.putLong(baseObj, baseOffset + index * WIDTH, value); + Platform.putLong(baseObj, baseOffset + index * WIDTH, value); } /** @@ -73,6 +73,6 @@ public void set(int index, long value) { public long get(int index) { assert index >= 0 : "index (" + index + ") should >= 0"; assert index < length : "index (" + index + ") should < length (" + length + ")"; - return PlatformDependent.UNSAFE.getLong(baseObj, baseOffset + index * WIDTH); + return Platform.getLong(baseObj, baseOffset + index * WIDTH); } } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java index 27462c7fa5e6..7857bf66a72a 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.bitset; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Methods for working with fixed-size uncompressed bitsets. 
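A quick standalone illustration of the two ByteArrayMethods helpers shown above, nextPowerOf2 and roundNumberOfBytesToNearestWord. The arithmetic mirrors the patch; the wrapper class and main method below are made up for the example and are not part of the change.

public class ByteArrayMethodsSketch {

  // Mirrors nextPowerOf2: returns num if it is already a power of two,
  // otherwise the next larger power of two.
  static long nextPowerOf2(long num) {
    long highBit = Long.highestOneBit(num);
    return (highBit == num) ? num : highBit << 1;
  }

  // Mirrors roundNumberOfBytesToNearestWord: rounds up to a multiple of 8.
  static int roundNumberOfBytesToNearestWord(int numBytes) {
    int remainder = numBytes & 0x07; // equivalent to numBytes % 8
    return remainder == 0 ? numBytes : numBytes + (8 - remainder);
  }

  public static void main(String[] args) {
    System.out.println(nextPowerOf2(1));   // 1
    System.out.println(nextPowerOf2(5));   // 8
    System.out.println(nextPowerOf2(64));  // 64
    System.out.println(roundNumberOfBytesToNearestWord(13)); // 16
    System.out.println(roundNumberOfBytesToNearestWord(16)); // 16
  }
}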
@@ -41,8 +41,8 @@ public static void set(Object baseObject, long baseOffset, int index) { assert index >= 0 : "index (" + index + ") should >= 0"; final long mask = 1L << (index & 0x3f); // mod 64 and shift final long wordOffset = baseOffset + (index >> 6) * WORD_SIZE; - final long word = PlatformDependent.UNSAFE.getLong(baseObject, wordOffset); - PlatformDependent.UNSAFE.putLong(baseObject, wordOffset, word | mask); + final long word = Platform.getLong(baseObject, wordOffset); + Platform.putLong(baseObject, wordOffset, word | mask); } /** @@ -52,8 +52,8 @@ public static void unset(Object baseObject, long baseOffset, int index) { assert index >= 0 : "index (" + index + ") should >= 0"; final long mask = 1L << (index & 0x3f); // mod 64 and shift final long wordOffset = baseOffset + (index >> 6) * WORD_SIZE; - final long word = PlatformDependent.UNSAFE.getLong(baseObject, wordOffset); - PlatformDependent.UNSAFE.putLong(baseObject, wordOffset, word & ~mask); + final long word = Platform.getLong(baseObject, wordOffset); + Platform.putLong(baseObject, wordOffset, word & ~mask); } /** @@ -63,7 +63,7 @@ public static boolean isSet(Object baseObject, long baseOffset, int index) { assert index >= 0 : "index (" + index + ") should >= 0"; final long mask = 1L << (index & 0x3f); // mod 64 and shift final long wordOffset = baseOffset + (index >> 6) * WORD_SIZE; - final long word = PlatformDependent.UNSAFE.getLong(baseObject, wordOffset); + final long word = Platform.getLong(baseObject, wordOffset); return (word & mask) != 0; } @@ -73,7 +73,7 @@ public static boolean isSet(Object baseObject, long baseOffset, int index) { public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords) { long addr = baseOffset; for (int i = 0; i < bitSetWidthInWords; i++, addr += WORD_SIZE) { - if (PlatformDependent.UNSAFE.getLong(baseObject, addr) != 0) { + if (Platform.getLong(baseObject, addr) != 0) { return true; } } @@ -109,8 +109,7 @@ public static int nextSetBit( // Try to find the next set bit in the current word final int subIndex = fromIndex & 0x3f; - long word = - PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + wi * WORD_SIZE) >> subIndex; + long word = Platform.getLong(baseObject, baseOffset + wi * WORD_SIZE) >> subIndex; if (word != 0) { return (wi << 6) + subIndex + java.lang.Long.numberOfTrailingZeros(word); } @@ -118,7 +117,7 @@ public static int nextSetBit( // Find the next set bit in the rest of the words wi += 1; while (wi < bitsetSizeInWords) { - word = PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + wi * WORD_SIZE); + word = Platform.getLong(baseObject, baseOffset + wi * WORD_SIZE); if (word != 0) { return (wi << 6) + java.lang.Long.numberOfTrailingZeros(word); } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index 61f483ced321..4276f25c2165 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.hash; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction. 
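The BitSetMethods hunks above keep the usual word/mask arithmetic: index >> 6 selects the containing 64-bit word and 1L << (index & 0x3f) selects the bit within it. The sketch below applies the same mapping to a plain long[] rather than a (baseObject, baseOffset) pair; the class name is illustrative only.

public class BitSetArithmeticSketch {

  static void set(long[] words, int index) {
    long mask = 1L << (index & 0x3f);   // bit position within the word (index % 64)
    words[index >> 6] |= mask;          // index >> 6 picks the containing 64-bit word
  }

  static boolean isSet(long[] words, int index) {
    long mask = 1L << (index & 0x3f);
    return (words[index >> 6] & mask) != 0;
  }

  public static void main(String[] args) {
    long[] words = new long[2];         // 128 bits
    set(words, 3);
    set(words, 70);
    System.out.println(isSet(words, 3));   // true
    System.out.println(isSet(words, 70));  // true
    System.out.println(isSet(words, 71));  // false
  }
}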
@@ -53,7 +53,7 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)"; int h1 = seed; for (int i = 0; i < lengthInBytes; i += 4) { - int halfWord = PlatformDependent.UNSAFE.getInt(base, offset + i); + int halfWord = Platform.getInt(base, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index bbe83d36cf36..6722301df19d 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -24,6 +24,9 @@ public class HeapMemoryAllocator implements MemoryAllocator { @Override public MemoryBlock allocate(long size) throws OutOfMemoryError { + if (size % 8 != 0) { + throw new IllegalArgumentException("Size " + size + " was not a multiple of 8"); + } long[] array = new long[(int) (size / 8)]; return MemoryBlock.fromLongArray(array); } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java index 91be46ba21ff..dd7582083437 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java @@ -19,7 +19,7 @@ import javax.annotation.Nullable; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * A consecutive block of memory, starting at a {@link MemoryLocation} with a fixed size. @@ -50,6 +50,6 @@ public long size() { * Creates a memory block pointing to the memory used by the long array. */ public static MemoryBlock fromLongArray(final long[] array) { - return new MemoryBlock(array, PlatformDependent.LONG_ARRAY_OFFSET, array.length * 8); + return new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, array.length * 8); } } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java index 358bb3725015..97b2c93f0dc3 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java @@ -60,7 +60,7 @@ public class TaskMemoryManager { /** * Maximum supported data page size (in bytes). In principle, the maximum addressable page size is - * (1L << OFFSET_BITS) bytes, which is 2+ petabytes. However, the on-heap allocator's maximum page + * (1L << OFFSET_BITS) bytes, which is 2+ petabytes. However, the on-heap allocator's maximum page * size is limited by the maximum amount of data that can be stored in a long[] array, which is * (2^32 - 1) * 8 bytes (or 16 gigabytes). Therefore, we cap this at 16 gigabytes. 
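HeapMemoryAllocator (above) and UnsafeMemoryAllocator (further below) now reject allocation sizes that are not multiples of 8, since on-heap pages are backed by long[] arrays. A caller working with arbitrary byte counts would round up before allocating; the helper below is an illustrative sketch under that assumption, not part of the patch.

public class AlignedSizeSketch {

  // Rounds a requested byte count up to the next multiple of 8
  // (unchanged if it is already word-aligned).
  static long roundUpToWordBoundary(long size) {
    return (size + 7L) & ~7L;
  }

  public static void main(String[] args) {
    System.out.println(roundUpToWordBoundary(13)); // 16
    System.out.println(roundUpToWordBoundary(16)); // 16
    // A heap page of this size maps onto a long[] of length size / 8,
    // which is how HeapMemoryAllocator backs its MemoryBlocks.
  }
}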
*/ @@ -144,14 +144,16 @@ public MemoryBlock allocatePage(long size) { public void freePage(MemoryBlock page) { assert (page.pageNumber != -1) : "Called freePage() on memory that wasn't allocated with allocatePage()"; - executorMemoryManager.free(page); + assert(allocatedPages.get(page.pageNumber)); + pageTable[page.pageNumber] = null; synchronized (this) { allocatedPages.clear(page.pageNumber); } - pageTable[page.pageNumber] = null; if (logger.isTraceEnabled()) { logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); } + // Cannot access a page once it's freed. + executorMemoryManager.free(page); } /** @@ -166,7 +168,9 @@ public void freePage(MemoryBlock page) { public MemoryBlock allocate(long size) throws OutOfMemoryError { assert(size > 0) : "Size must be positive, but got " + size; final MemoryBlock memory = executorMemoryManager.allocate(size); - allocatedNonPageMemory.add(memory); + synchronized(allocatedNonPageMemory) { + allocatedNonPageMemory.add(memory); + } return memory; } @@ -176,8 +180,10 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { public void free(MemoryBlock memory) { assert (memory.pageNumber == -1) : "Should call freePage() for pages, not free()"; executorMemoryManager.free(memory); - final boolean wasAlreadyRemoved = !allocatedNonPageMemory.remove(memory); - assert (!wasAlreadyRemoved) : "Called free() on memory that was already freed!"; + synchronized(allocatedNonPageMemory) { + final boolean wasAlreadyRemoved = !allocatedNonPageMemory.remove(memory); + assert (!wasAlreadyRemoved) : "Called free() on memory that was already freed!"; + } } /** @@ -223,9 +229,10 @@ public Object getPage(long pagePlusOffsetAddress) { if (inHeap) { final int pageNumber = decodePageNumber(pagePlusOffsetAddress); assert (pageNumber >= 0 && pageNumber < PAGE_TABLE_SIZE); - final Object page = pageTable[pageNumber].getBaseObject(); + final MemoryBlock page = pageTable[pageNumber]; assert (page != null); - return page; + assert (page.getBaseObject() != null); + return page.getBaseObject(); } else { return null; } @@ -244,7 +251,9 @@ public long getOffsetInPage(long pagePlusOffsetAddress) { // converted the absolute address into a relative address. Here, we invert that operation: final int pageNumber = decodePageNumber(pagePlusOffsetAddress); assert (pageNumber >= 0 && pageNumber < PAGE_TABLE_SIZE); - return pageTable[pageNumber].getBaseOffset() + offsetInPage; + final MemoryBlock page = pageTable[pageNumber]; + assert (page != null); + return page.getBaseOffset() + offsetInPage; } } @@ -260,14 +269,17 @@ public long cleanUpAllAllocatedMemory() { freePage(page); } } - final Iterator iter = allocatedNonPageMemory.iterator(); - while (iter.hasNext()) { - final MemoryBlock memory = iter.next(); - freedBytes += memory.size(); - // We don't call free() here because that calls Set.remove, which would lead to a - // ConcurrentModificationException here. - executorMemoryManager.free(memory); - iter.remove(); + + synchronized (allocatedNonPageMemory) { + final Iterator iter = allocatedNonPageMemory.iterator(); + while (iter.hasNext()) { + final MemoryBlock memory = iter.next(); + freedBytes += memory.size(); + // We don't call free() here because that calls Set.remove, which would lead to a + // ConcurrentModificationException here. 
+ executorMemoryManager.free(memory); + iter.remove(); + } } return freedBytes; } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java index 15898771fef2..cda7826c8c99 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.memory; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * A simple {@link MemoryAllocator} that uses {@code Unsafe} to allocate off-heap memory. @@ -26,7 +26,10 @@ public class UnsafeMemoryAllocator implements MemoryAllocator { @Override public MemoryBlock allocate(long size) throws OutOfMemoryError { - long address = PlatformDependent.UNSAFE.allocateMemory(size); + if (size % 8 != 0) { + throw new IllegalArgumentException("Size " + size + " was not a multiple of 8"); + } + long address = Platform.allocateMemory(size); return new MemoryBlock(null, address, size); } @@ -34,6 +37,6 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { public void free(MemoryBlock memory) { assert (memory.obj == null) : "baseObject not null; are you trying to use the off-heap allocator to free on-heap memory?"; - PlatformDependent.UNSAFE.freeMemory(memory.offset); + Platform.freeMemory(memory.offset); } } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java index 69b0e206cef1..c08c9c73d239 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java @@ -17,7 +17,7 @@ package org.apache.spark.unsafe.types; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; public class ByteArray { @@ -27,12 +27,6 @@ public class ByteArray { * hold all the bytes in this string. 
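The TaskMemoryManager hunks above serialize all access to allocatedNonPageMemory: allocate() and free() add and remove inside synchronized blocks, and cleanUpAllAllocatedMemory() iterates under the same lock, removing via the iterator so a concurrent free() cannot trigger a ConcurrentModificationException. Below is a minimal standalone sketch of that pattern, with made-up names.

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class TrackedAllocator {

  static final class Block {
    final long size;
    Block(long size) { this.size = size; }
  }

  private final Set<Block> allocated = new HashSet<>();

  public Block allocate(long size) {
    Block b = new Block(size);
    synchronized (allocated) {       // guard the shared tracking set
      allocated.add(b);
    }
    return b;
  }

  public void free(Block b) {
    synchronized (allocated) {
      boolean removed = allocated.remove(b);
      assert removed : "Called free() on memory that was already freed!";
    }
  }

  public long cleanUpAll() {
    long freedBytes = 0;
    synchronized (allocated) {       // iterate under the same lock used by add/remove
      Iterator<Block> it = allocated.iterator();
      while (it.hasNext()) {
        freedBytes += it.next().size;
        it.remove();                 // remove via the iterator, not free(), to avoid CME
      }
    }
    return freedBytes;
  }
}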
*/ public static void writeToMemory(byte[] src, Object target, long targetOffset) { - PlatformDependent.copyMemory( - src, - PlatformDependent.BYTE_ARRAY_OFFSET, - target, - targetOffset, - src.length - ); + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET, target, targetOffset, src.length); } } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java index 92a5e4f86f23..30e175807636 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java @@ -50,6 +50,14 @@ private static String unitRegex(String unit) { unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") + unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond")); + private static Pattern yearMonthPattern = + Pattern.compile("^(?:['|\"])?([+|-])?(\\d+)-(\\d+)(?:['|\"])?$"); + + private static Pattern dayTimePattern = + Pattern.compile("^(?:['|\"])?([+|-])?(\\d+) (\\d+):(\\d+):(\\d+)(\\.(\\d+))?(?:['|\"])?$"); + + private static Pattern quoteTrimPattern = Pattern.compile("^(?:['|\"])?(.*?)(?:['|\"])?$"); + private static long toLong(String s) { if (s == null) { return 0; @@ -79,6 +87,154 @@ public static CalendarInterval fromString(String s) { } } + public static long toLongWithRange(String fieldName, + String s, long minValue, long maxValue) throws IllegalArgumentException { + long result = 0; + if (s != null) { + result = Long.valueOf(s); + if (result < minValue || result > maxValue) { + throw new IllegalArgumentException(String.format("%s %d outside range [%d, %d]", + fieldName, result, minValue, maxValue)); + } + } + return result; + } + + /** + * Parse YearMonth string in form: [-]YYYY-MM + * + * adapted from HiveIntervalYearMonth.valueOf + */ + public static CalendarInterval fromYearMonthString(String s) throws IllegalArgumentException { + CalendarInterval result = null; + if (s == null) { + throw new IllegalArgumentException("Interval year-month string was null"); + } + s = s.trim(); + Matcher m = yearMonthPattern.matcher(s); + if (!m.matches()) { + throw new IllegalArgumentException( + "Interval string does not match year-month format of 'y-m': " + s); + } else { + try { + int sign = m.group(1) != null && m.group(1).equals("-") ? -1 : 1; + int years = (int) toLongWithRange("year", m.group(2), 0, Integer.MAX_VALUE); + int months = (int) toLongWithRange("month", m.group(3), 0, 11); + result = new CalendarInterval(sign * (years * 12 + months), 0); + } catch (Exception e) { + throw new IllegalArgumentException( + "Error parsing interval year-month string: " + e.getMessage(), e); + } + } + return result; + } + + /** + * Parse dayTime string in form: [-]d HH:mm:ss.nnnnnnnnn + * + * adapted from HiveIntervalDayTime.valueOf + */ + public static CalendarInterval fromDayTimeString(String s) throws IllegalArgumentException { + CalendarInterval result = null; + if (s == null) { + throw new IllegalArgumentException("Interval day-time string was null"); + } + s = s.trim(); + Matcher m = dayTimePattern.matcher(s); + if (!m.matches()) { + throw new IllegalArgumentException( + "Interval string does not match day-time format of 'd h:m:s.n': " + s); + } else { + try { + int sign = m.group(1) != null && m.group(1).equals("-") ? 
-1 : 1; + long days = toLongWithRange("day", m.group(2), 0, Integer.MAX_VALUE); + long hours = toLongWithRange("hour", m.group(3), 0, 23); + long minutes = toLongWithRange("minute", m.group(4), 0, 59); + long seconds = toLongWithRange("second", m.group(5), 0, 59); + // Hive allow nanosecond precision interval + long nanos = toLongWithRange("nanosecond", m.group(7), 0L, 999999999L); + result = new CalendarInterval(0, sign * ( + days * MICROS_PER_DAY + hours * MICROS_PER_HOUR + minutes * MICROS_PER_MINUTE + + seconds * MICROS_PER_SECOND + nanos / 1000L)); + } catch (Exception e) { + throw new IllegalArgumentException( + "Error parsing interval day-time string: " + e.getMessage(), e); + } + } + return result; + } + + public static CalendarInterval fromSingleUnitString(String unit, String s) + throws IllegalArgumentException { + + CalendarInterval result = null; + if (s == null) { + throw new IllegalArgumentException(String.format("Interval %s string was null", unit)); + } + s = s.trim(); + Matcher m = quoteTrimPattern.matcher(s); + if (!m.matches()) { + throw new IllegalArgumentException( + "Interval string does not match day-time format of 'd h:m:s.n': " + s); + } else { + try { + if (unit.equals("year")) { + int year = (int) toLongWithRange("year", m.group(1), + Integer.MIN_VALUE / 12, Integer.MAX_VALUE / 12); + result = new CalendarInterval(year * 12, 0L); + + } else if (unit.equals("month")) { + int month = (int) toLongWithRange("month", m.group(1), + Integer.MIN_VALUE, Integer.MAX_VALUE); + result = new CalendarInterval(month, 0L); + + } else if (unit.equals("day")) { + long day = toLongWithRange("day", m.group(1), + Long.MIN_VALUE / MICROS_PER_DAY, Long.MAX_VALUE / MICROS_PER_DAY); + result = new CalendarInterval(0, day * MICROS_PER_DAY); + + } else if (unit.equals("hour")) { + long hour = toLongWithRange("hour", m.group(1), + Long.MIN_VALUE / MICROS_PER_HOUR, Long.MAX_VALUE / MICROS_PER_HOUR); + result = new CalendarInterval(0, hour * MICROS_PER_HOUR); + + } else if (unit.equals("minute")) { + long minute = toLongWithRange("minute", m.group(1), + Long.MIN_VALUE / MICROS_PER_MINUTE, Long.MAX_VALUE / MICROS_PER_MINUTE); + result = new CalendarInterval(0, minute * MICROS_PER_MINUTE); + + } else if (unit.equals("second")) { + long micros = parseSecondNano(m.group(1)); + result = new CalendarInterval(0, micros); + } + } catch (Exception e) { + throw new IllegalArgumentException("Error parsing interval string: " + e.getMessage(), e); + } + } + return result; + } + + /** + * Parse second_nano string in ss.nnnnnnnnn format to microseconds + */ + public static long parseSecondNano(String secondNano) throws IllegalArgumentException { + String[] parts = secondNano.split("\\."); + if (parts.length == 1) { + return toLongWithRange("second", parts[0], Long.MIN_VALUE / MICROS_PER_SECOND, + Long.MAX_VALUE / MICROS_PER_SECOND) * MICROS_PER_SECOND; + + } else if (parts.length == 2) { + long seconds = parts[0].equals("") ? 
0L : toLongWithRange("second", parts[0], + Long.MIN_VALUE / MICROS_PER_SECOND, Long.MAX_VALUE / MICROS_PER_SECOND); + long nanos = toLongWithRange("nanosecond", parts[1], 0L, 999999999L); + return seconds * MICROS_PER_SECOND + nanos / 1000L; + + } else { + throw new IllegalArgumentException( + "Interval string does not match second-nano format of ss.nnnnnnnnn"); + } + } + public final int months; public final long microseconds; diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index f6c9b87778f8..cbcab958c05a 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -18,14 +18,15 @@ package org.apache.spark.unsafe.types; import javax.annotation.Nonnull; -import java.io.Serializable; -import java.io.UnsupportedEncodingException; +import java.io.*; +import java.nio.ByteOrder; import java.util.Arrays; +import java.util.Map; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; -import static org.apache.spark.unsafe.PlatformDependent.*; +import static org.apache.spark.unsafe.Platform.*; /** @@ -36,12 +37,13 @@ *

* Note: This is not designed for general use cases, should not be used outside SQL. */ -public final class UTF8String implements Comparable, Serializable { +public final class UTF8String implements Comparable, Externalizable { + // These are only updated by readExternal() @Nonnull - private final Object base; - private final long offset; - private final int numBytes; + private Object base; + private long offset; + private int numBytes; public Object getBaseObject() { return base; } public long getBaseOffset() { return offset; } @@ -53,6 +55,9 @@ public final class UTF8String implements Comparable, Serializable { 5, 5, 5, 5, 6, 6}; + private static boolean isLittleEndian = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; + + private static final UTF8String COMMA_UTF8 = UTF8String.fromString(","); public static final UTF8String EMPTY_UTF8 = UTF8String.fromString(""); /** @@ -122,19 +127,18 @@ protected UTF8String(Object base, long offset, int numBytes) { this.numBytes = numBytes; } + // for serialization + public UTF8String() { + this(null, 0, 0); + } + /** * Writes the content of this string into a memory address, identified by an object and an offset. * The target memory address must already been allocated, and have enough space to hold all the * bytes in this string. */ public void writeToMemory(Object target, long targetOffset) { - PlatformDependent.copyMemory( - base, - offset, - target, - targetOffset, - numBytes - ); + Platform.copyMemory(base, offset, target, targetOffset, numBytes); } /** @@ -175,18 +179,35 @@ public long getPrefix() { // If size is greater than 4, assume we have at least 8 bytes of data to fetch. // After getting the data, we use a mask to mask out data that is not part of the string. long p; - if (numBytes >= 8) { - p = PlatformDependent.UNSAFE.getLong(base, offset); - } else if (numBytes > 4) { - p = PlatformDependent.UNSAFE.getLong(base, offset); - p = p & ((1L << numBytes * 8) - 1); - } else if (numBytes > 0) { - p = (long) PlatformDependent.UNSAFE.getInt(base, offset); - p = p & ((1L << numBytes * 8) - 1); + long mask = 0; + if (isLittleEndian) { + if (numBytes >= 8) { + p = Platform.getLong(base, offset); + } else if (numBytes > 4) { + p = Platform.getLong(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else if (numBytes > 0) { + p = (long) Platform.getInt(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else { + p = 0; + } + p = java.lang.Long.reverseBytes(p); } else { - p = 0; + // byteOrder == ByteOrder.BIG_ENDIAN + if (numBytes >= 8) { + p = Platform.getLong(base, offset); + } else if (numBytes > 4) { + p = Platform.getLong(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else if (numBytes > 0) { + p = ((long) Platform.getInt(base, offset)) << 32; + mask = (1L << (8 - numBytes) * 8) - 1; + } else { + p = 0; + } } - p = java.lang.Long.reverseBytes(p); + p &= ~mask; return p; } @@ -271,7 +292,7 @@ public boolean contains(final UTF8String substring) { * Returns the byte at position `i`. */ private byte getByte(int i) { - return UNSAFE.getByte(base, offset + i); + return Platform.getByte(base, offset + i); } private boolean matchAt(final UTF8String s, int pos) { @@ -391,6 +412,36 @@ private UTF8String toTitleCaseSlow() { return fromString(sb.toString()); } + /* + * Returns the index of the string `match` in this String. This string has to be a comma separated + * list. If `match` contains a comma 0 will be returned. 
If the `match` isn't part of this String, + * 0 will be returned, else the index of match (1-based index) + */ + public int findInSet(UTF8String match) { + if (match.contains(COMMA_UTF8)) { + return 0; + } + + int n = 1, lastComma = -1; + for (int i = 0; i < numBytes; i++) { + if (getByte(i) == (byte) ',') { + if (i - (lastComma + 1) == match.numBytes && + ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset, + match.numBytes)) { + return n; + } + lastComma = i; + n++; + } + } + if (numBytes - (lastComma + 1) == match.numBytes && + ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset, + match.numBytes)) { + return n; + } + return 0; + } + /** * Copy the bytes from the current UTF8String, and make a new UTF8String. * @param start the start position of the current UTF8String in bytes. @@ -717,7 +768,7 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { int len = inputs[i].numBytes; copyMemory( inputs[i].base, inputs[i].offset, - result, PlatformDependent.BYTE_ARRAY_OFFSET + offset, + result, BYTE_ARRAY_OFFSET + offset, len); offset += len; @@ -726,7 +777,7 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { if (j < numInputs) { copyMemory( separator.base, separator.offset, - result, PlatformDependent.BYTE_ARRAY_OFFSET + offset, + result, BYTE_ARRAY_OFFSET + offset, separator.numBytes); offset += separator.numBytes; } @@ -744,6 +795,21 @@ public UTF8String[] split(UTF8String pattern, int limit) { return res; } + // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes + public UTF8String translate(Map dict) { + String srcStr = this.toString(); + + StringBuilder sb = new StringBuilder(); + for(int k = 0; k< srcStr.length(); k++) { + if (null == dict.get(srcStr.charAt(k))) { + sb.append(srcStr.charAt(k)); + } else if ('\0' != dict.get(srcStr.charAt(k))){ + sb.append(dict.get(srcStr.charAt(k))); + } + } + return fromString(sb.toString()); + } + @Override public String toString() { try { @@ -917,4 +983,18 @@ public UTF8String soundex() { } return UTF8String.fromBytes(sx); } + + public void writeExternal(ObjectOutput out) throws IOException { + byte[] bytes = getBytes(); + out.writeInt(bytes.length); + out.write(bytes); + } + + public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { + offset = BYTE_ARRAY_OFFSET; + numBytes = in.readInt(); + base = new byte[numBytes]; + in.readFully((byte[]) base); + } + } diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java b/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java index 3b9175835229..2f8cb132ac8b 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java @@ -22,7 +22,7 @@ import java.util.Set; import junit.framework.Assert; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.junit.Test; /** @@ -83,11 +83,11 @@ public void randomizedStressTestBytes() { rand.nextBytes(bytes); Assert.assertEquals( - hasher.hashUnsafeWords(bytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize), - hasher.hashUnsafeWords(bytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize)); + hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); hashcodes.add(hasher.hashUnsafeWords( 
- bytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize)); + bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. @@ -106,11 +106,11 @@ public void randomizedStressTestPaddedStrings() { System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length); Assert.assertEquals( - hasher.hashUnsafeWords(paddedBytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize), - hasher.hashUnsafeWords(paddedBytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize)); + hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); hashcodes.add(hasher.hashUnsafeWords( - paddedBytes, PlatformDependent.BYTE_ARRAY_OFFSET, byteArrSize)); + paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java index 6274b92b47dd..80d4982c4b57 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java @@ -101,6 +101,97 @@ public void fromStringTest() { assertEquals(CalendarInterval.fromString(input), null); } + @Test + public void fromYearMonthStringTest() { + String input; + CalendarInterval i; + + input = "99-10"; + i = new CalendarInterval(99 * 12 + 10, 0L); + assertEquals(CalendarInterval.fromYearMonthString(input), i); + + input = "-8-10"; + i = new CalendarInterval(-8 * 12 - 10, 0L); + assertEquals(CalendarInterval.fromYearMonthString(input), i); + + try { + input = "99-15"; + CalendarInterval.fromYearMonthString(input); + fail("Expected to throw an exception for the invalid input"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("month 15 outside range")); + } + } + + @Test + public void fromDayTimeStringTest() { + String input; + CalendarInterval i; + + input = "5 12:40:30.999999999"; + i = new CalendarInterval(0, 5 * MICROS_PER_DAY + 12 * MICROS_PER_HOUR + + 40 * MICROS_PER_MINUTE + 30 * MICROS_PER_SECOND + 999999L); + assertEquals(CalendarInterval.fromDayTimeString(input), i); + + input = "10 0:12:0.888"; + i = new CalendarInterval(0, 10 * MICROS_PER_DAY + 12 * MICROS_PER_MINUTE); + assertEquals(CalendarInterval.fromDayTimeString(input), i); + + input = "-3 0:0:0"; + i = new CalendarInterval(0, -3 * MICROS_PER_DAY); + assertEquals(CalendarInterval.fromDayTimeString(input), i); + + try { + input = "5 30:12:20"; + CalendarInterval.fromDayTimeString(input); + fail("Expected to throw an exception for the invalid input"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("hour 30 outside range")); + } + + try { + input = "5 30-12"; + CalendarInterval.fromDayTimeString(input); + fail("Expected to throw an exception for the invalid input"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("not match day-time format")); + } + } + + @Test + public void fromSingleUnitStringTest() { + String input; + CalendarInterval i; + + input = "12"; + i = new CalendarInterval(12 * 12, 0L); + assertEquals(CalendarInterval.fromSingleUnitString("year", input), i); + + input = "100"; + i = new CalendarInterval(0, 100 * MICROS_PER_DAY); + assertEquals(CalendarInterval.fromSingleUnitString("day", input), i); + + input = "1999.38888"; + i = new CalendarInterval(0, 1999 * MICROS_PER_SECOND + 38); + 
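For reference, a small usage sketch of the three interval parsers added above, assuming the patched org.apache.spark.unsafe.types.CalendarInterval is on the classpath; the expected values mirror the assertions in CalendarIntervalSuite.

import org.apache.spark.unsafe.types.CalendarInterval;

public class IntervalParsingSketch {
  public static void main(String[] args) {
    // [-]YYYY-MM, e.g. 99 years and 10 months
    CalendarInterval ym = CalendarInterval.fromYearMonthString("99-10");

    // [-]d HH:mm:ss[.n]; nanoseconds are truncated to microseconds
    CalendarInterval dt = CalendarInterval.fromDayTimeString("5 12:40:30.999999999");

    // single unit, e.g. 100 days
    CalendarInterval d = CalendarInterval.fromSingleUnitString("day", "100");

    System.out.println(ym.months);        // 99 * 12 + 10 = 1198
    System.out.println(dt.microseconds);  // 5 days + 12h + 40m + 30s + 999999 microseconds
    System.out.println(d.microseconds);   // 100 * MICROS_PER_DAY
  }
}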
assertEquals(CalendarInterval.fromSingleUnitString("second", input), i); + + try { + input = String.valueOf(Integer.MAX_VALUE); + CalendarInterval.fromSingleUnitString("year", input); + fail("Expected to throw an exception for the invalid input"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("outside range")); + } + + try { + input = String.valueOf(Long.MAX_VALUE / MICROS_PER_HOUR + 1); + CalendarInterval.fromSingleUnitString("hour", input); + fail("Expected to throw an exception for the invalid input"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("outside range")); + } + } + @Test public void addTest() { String input = "interval 3 month 1 hour"; diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 9b3190f8f0c3..98aa8a2469a7 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -19,7 +19,9 @@ import java.io.UnsupportedEncodingException; import java.util.Arrays; +import java.util.HashMap; +import com.google.common.collect.ImmutableMap; import org.junit.Test; import static junit.framework.Assert.*; @@ -391,6 +393,35 @@ public void levenshteinDistance() { assertEquals(fromString("世界千世").levenshteinDistance(fromString("千a世b")),4); } + @Test + public void translate() { + assertEquals( + fromString("1a2s3ae"), + fromString("translate").translate(ImmutableMap.of( + 'r', '1', + 'n', '2', + 'l', '3', + 't', '\0' + ))); + assertEquals( + fromString("translate"), + fromString("translate").translate(new HashMap())); + assertEquals( + fromString("asae"), + fromString("translate").translate(ImmutableMap.of( + 'r', '\0', + 'n', '\0', + 'l', '\0', + 't', '\0' + ))); + assertEquals( + fromString("aa世b"), + fromString("花花世界").translate(ImmutableMap.of( + '花', 'a', + '界', 'b' + ))); + } + @Test public void createBlankString() { assertEquals(fromString(" "), blankString(1)); @@ -399,6 +430,18 @@ public void createBlankString() { assertEquals(fromString(""), blankString(0)); } + @Test + public void findInSet() { + assertEquals(fromString("ab").findInSet(fromString("ab")), 1); + assertEquals(fromString("a,b").findInSet(fromString("b")), 2); + assertEquals(fromString("abc,b,ab,c,def").findInSet(fromString("ab")), 3); + assertEquals(fromString("ab,abc,b,ab,c,def").findInSet(fromString("ab")), 1); + assertEquals(fromString(",,,ab,abc,b,ab,c,def").findInSet(fromString("ab")), 4); + assertEquals(fromString(",ab,abc,b,ab,c,def").findInSet(fromString("")), 1); + assertEquals(fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("ab")), 4); + assertEquals(fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("def")), 6); + } + @Test public void soundex() { assertEquals(fromString("Robert").soundex(), fromString("R163")); diff --git a/yarn/pom.xml b/yarn/pom.xml index 2aeed98285aa..60d4e9f3b79a 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.5.1-SNAPSHOT ../pom.xml @@ -30,7 +30,6 @@ Spark Project YARN yarn - 1.9 @@ -93,12 +92,28 @@ jetty-servlet - + + + + org.eclipse.jetty.orbit + javax.servlet.jsp + 2.2.0.v201112011158 + test + + + org.eclipse.jetty.orbit + javax.servlet.jsp.jstl + 1.2.0.v201105211821 + test + + - + org.apache.hadoop hadoop-yarn-server-tests @@ -125,29 +140,20 @@ com.sun.jersey jersey-core - ${jersey.version} test com.sun.jersey jersey-json - 
${jersey.version} test - - - stax - stax-api - - com.sun.jersey jersey-server - ${jersey.version} test - + target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 1d67b3ebb51b..991b5cec00bd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -30,8 +30,8 @@ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.spark.rpc._ -import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, SparkEnv} -import org.apache.spark.SparkException +import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, SparkEnv, + SparkException, SparkUserAppException} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.history.HistoryServer import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend} @@ -64,7 +64,8 @@ private[spark] class ApplicationMaster( // Default to numExecutors * 2, with minimum of 3 private val maxNumExecutorFailures = sparkConf.getInt("spark.yarn.max.executor.failures", - sparkConf.getInt("spark.yarn.max.worker.failures", math.max(args.numExecutors * 2, 3))) + sparkConf.getInt("spark.yarn.max.worker.failures", + math.max(sparkConf.getInt("spark.executor.instances", 0) * 2, 3))) @volatile private var exitCode = 0 @volatile private var unregistered = false @@ -111,7 +112,8 @@ private[spark] class ApplicationMaster( val fs = FileSystem.get(yarnConf) // This shutdown hook should run *after* the SparkContext is shut down. 
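With --num-executors removed from ApplicationMasterArguments, the default for spark.yarn.max.executor.failures shown above is now derived from spark.executor.instances: twice the instance count, with a floor of 3, and an explicitly configured value still wins. A sketch of that fallback, with made-up names:

public class MaxExecutorFailuresSketch {

  static int maxExecutorFailures(Integer configured, int executorInstances) {
    if (configured != null) {
      return configured;                       // spark.yarn.max.executor.failures, if set
    }
    return Math.max(executorInstances * 2, 3); // otherwise 2 * instances, at least 3
  }

  public static void main(String[] args) {
    System.out.println(maxExecutorFailures(null, 0));   // 3  (instances unset, e.g. dynamic allocation)
    System.out.println(maxExecutorFailures(null, 10));  // 20
    System.out.println(maxExecutorFailures(8, 10));     // 8
  }
}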
- Utils.addShutdownHook(Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1) { () => + val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1 + ShutdownHookManager.addShutdownHook(priority) { () => val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf) val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts @@ -198,7 +200,7 @@ private[spark] class ApplicationMaster( final def finish(status: FinalApplicationStatus, code: Int, msg: String = null): Unit = { synchronized { if (!finished) { - val inShutdown = Utils.inShutdown() + val inShutdown = ShutdownHookManager.inShutdown() logInfo(s"Final app status: $status, exitCode: $code" + Option(msg).map(msg => s", (reason: $msg)").getOrElse("")) exitCode = code @@ -493,7 +495,6 @@ private[spark] class ApplicationMaster( */ private def startUserApplication(): Thread = { logInfo("Starting the user application in a separate Thread") - System.setProperty("spark.executor.instances", args.numExecutors.toString) val classpath = Client.getUserClasspath(sparkConf) val urls = classpath.map { entry => @@ -529,6 +530,10 @@ private[spark] class ApplicationMaster( e.getCause match { case _: InterruptedException => // Reporter thread can interrupt to stop user class + case SparkUserAppException(exitCode) => + val msg = s"User application exited with status $exitCode" + logError(msg) + finish(FinalApplicationStatus.FAILED, exitCode, msg) case cause: Throwable => logError("User class threw exception: " + cause, cause) finish(FinalApplicationStatus.FAILED, diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala index 37f793763367..b08412414aa1 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala @@ -29,7 +29,6 @@ class ApplicationMasterArguments(val args: Array[String]) { var userArgs: Seq[String] = Nil var executorMemory = 1024 var executorCores = 1 - var numExecutors = DEFAULT_NUMBER_EXECUTORS var propertiesFile: String = null parseArgs(args.toList) @@ -63,10 +62,6 @@ class ApplicationMasterArguments(val args: Array[String]) { userArgsBuffer += value args = tail - case ("--num-workers" | "--num-executors") :: IntParam(value) :: tail => - numExecutors = value - args = tail - case ("--worker-memory" | "--executor-memory") :: MemoryParam(value) :: tail => executorMemory = value args = tail diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index fc11bbf97e2e..7cedcbcdf3cd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -203,12 +203,14 @@ private[spark] class Client( val executorMem = args.executorMemory + executorMemoryOverhead if (executorMem > maxMem) { throw new IllegalArgumentException(s"Required executor memory (${args.executorMemory}" + - s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster!") + s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! 
" + + "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.") } val amMem = args.amMemory + amMemoryOverhead if (amMem > maxMem) { throw new IllegalArgumentException(s"Required AM memory (${args.amMemory}" + - s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster!") + s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " + + "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.") } logInfo("Will allocate AM container, with %d MB memory including %d MB overhead".format( amMem, @@ -412,7 +414,7 @@ private[spark] class Client( } // Distribute an archive with Hadoop and Spark configuration for the AM. - val (_, confLocalizedPath) = distribute(createConfArchive().getAbsolutePath(), + val (_, confLocalizedPath) = distribute(createConfArchive().toURI().getPath(), resType = LocalResourceType.ARCHIVE, destName = Some(LOCALIZED_CONF_DIR), appMasterOnly = true) @@ -749,7 +751,6 @@ private[spark] class Client( userArgs ++ Seq( "--executor-memory", args.executorMemory.toString + "m", "--executor-cores", args.executorCores.toString, - "--num-executors ", args.numExecutors.toString, "--properties-file", buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), LOCALIZED_CONF_DIR, SPARK_CONF_FILE)) @@ -958,6 +959,10 @@ object Client extends Logging { val sparkConf = new SparkConf val args = new ClientArguments(argStrings, sparkConf) + // to maintain backwards-compatibility + if (!Utils.isDynamicAllocationEnabled(sparkConf)) { + sparkConf.setIfMissing("spark.executor.instances", args.numExecutors.toString) + } new Client(args, sparkConf).run() } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index 20d63d40cf60..54f62e6b723a 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -53,8 +53,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) private val amMemOverheadKey = "spark.yarn.am.memoryOverhead" private val driverCoresKey = "spark.driver.cores" private val amCoresKey = "spark.yarn.am.cores" - private val isDynamicAllocationEnabled = - sparkConf.getBoolean("spark.dynamicAllocation.enabled", false) + private val isDynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(sparkConf) parseArgs(args.toList) loadEnvironmentArgs() @@ -97,6 +96,9 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) } numExecutors = initialNumExecutors + } else { + val numExecutorsConf = "spark.executor.instances" + numExecutors = sparkConf.getInt(numExecutorsConf, numExecutors) } principal = Option(principal) .orElse(sparkConf.getOption("spark.yarn.principal")) @@ -196,11 +198,6 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) if (args(0) == "--num-workers") { println("--num-workers is deprecated. 
Use --num-executors instead.") } - // Dynamic allocation is not compatible with this option - if (isDynamicAllocationEnabled) { - throw new IllegalArgumentException("Explicitly setting the number " + - "of executors is not compatible with spark.dynamicAllocation.enabled!") - } numExecutors = value args = tail diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 59caa787b6e2..ccf753e69f4b 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -21,6 +21,8 @@ import java.util.Collections import java.util.concurrent._ import java.util.regex.Pattern +import org.apache.spark.util.Utils + import scala.collection.JavaConversions._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} @@ -86,7 +88,12 @@ private[yarn] class YarnAllocator( private var executorIdCounter = 0 @volatile private var numExecutorsFailed = 0 - @volatile private var targetNumExecutors = args.numExecutors + @volatile private var targetNumExecutors = + if (Utils.isDynamicAllocationEnabled(sparkConf)) { + sparkConf.getInt("spark.dynamicAllocation.initialExecutors", 0) + } else { + sparkConf.getInt("spark.executor.instances", YarnSparkHadoopUtil.DEFAULT_NUMBER_EXECUTORS) + } // Keep track of which container is running which executor to remove the executors later // Visible for testing. diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index d97fa2e2151b..d06d95140438 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -33,7 +33,7 @@ private[spark] class YarnClientSchedulerBackend( private var client: Client = null private var appId: ApplicationId = null - private var monitorThread: Thread = null + private var monitorThread: MonitorThread = null /** * Create a Yarn client to submit an application to the ResourceManager. @@ -81,8 +81,6 @@ private[spark] class YarnClientSchedulerBackend( // List of (target Client argument, environment variable, Spark property) val optionTuples = List( - ("--num-executors", "SPARK_WORKER_INSTANCES", "spark.executor.instances"), - ("--num-executors", "SPARK_EXECUTOR_INSTANCES", "spark.executor.instances"), ("--executor-memory", "SPARK_WORKER_MEMORY", "spark.executor.memory"), ("--executor-memory", "SPARK_EXECUTOR_MEMORY", "spark.executor.memory"), ("--executor-cores", "SPARK_WORKER_CORES", "spark.executor.cores"), @@ -92,7 +90,6 @@ private[spark] class YarnClientSchedulerBackend( ) // Warn against the following deprecated environment variables: env var -> suggestion val deprecatedEnvVars = Map( - "SPARK_WORKER_INSTANCES" -> "SPARK_WORKER_INSTANCES or --num-executors through spark-submit", "SPARK_WORKER_MEMORY" -> "SPARK_EXECUTOR_MEMORY or --executor-memory through spark-submit", "SPARK_WORKER_CORES" -> "SPARK_EXECUTOR_CORES or --executor-cores through spark-submit") optionTuples.foreach { case (optionName, envVar, sparkProp) => @@ -131,24 +128,42 @@ private[spark] class YarnClientSchedulerBackend( } } + /** + * We create this class for SPARK-9519. 
Basically when we interrupt the monitor thread it's + * because the SparkContext is being shut down(sc.stop() called by user code), but if + * monitorApplication return, it means the Yarn application finished before sc.stop() was called, + * which means we should call sc.stop() here, and we don't allow the monitor to be interrupted + * before SparkContext stops successfully. + */ + private class MonitorThread extends Thread { + private var allowInterrupt = true + + override def run() { + try { + val (state, _) = client.monitorApplication(appId, logApplicationReport = false) + logError(s"Yarn application has already exited with state $state!") + allowInterrupt = false + sc.stop() + } catch { + case e: InterruptedException => logInfo("Interrupting monitor thread") + } + } + + def stopMonitor(): Unit = { + if (allowInterrupt) { + this.interrupt() + } + } + } + /** * Monitor the application state in a separate thread. * If the application has exited for any reason, stop the SparkContext. * This assumes both `client` and `appId` have already been set. */ - private def asyncMonitorApplication(): Thread = { + private def asyncMonitorApplication(): MonitorThread = { assert(client != null && appId != null, "Application has not been submitted yet!") - val t = new Thread { - override def run() { - try { - val (state, _) = client.monitorApplication(appId, logApplicationReport = false) - logError(s"Yarn application has already exited with state $state!") - sc.stop() - } catch { - case e: InterruptedException => logInfo("Interrupting monitor thread") - } - } - } + val t = new MonitorThread t.setName("Yarn application state monitor") t.setDaemon(true) t @@ -160,7 +175,7 @@ private[spark] class YarnClientSchedulerBackend( override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") if (monitorThread != null) { - monitorThread.interrupt() + monitorThread.stopMonitor() } super.stop() client.stop() @@ -174,5 +189,4 @@ private[spark] class YarnClientSchedulerBackend( super.applicationId } } - } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala index 58318bf9bcc0..5d05f514adde 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala @@ -87,16 +87,17 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter def createAllocator(maxExecutors: Int = 5): YarnAllocator = { val args = Array( - "--num-executors", s"$maxExecutors", "--executor-cores", "5", "--executor-memory", "2048", "--jar", "somejar.jar", "--class", "SomeClass") + val sparkConfClone = sparkConf.clone() + sparkConfClone.set("spark.executor.instances", maxExecutors.toString) new YarnAllocator( "not used", mock(classOf[RpcEndpointRef]), conf, - sparkConf, + sparkConfClone, rmClient, appAttemptId, new ApplicationMasterArguments(args), diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 547863d9a073..eb6e1fd37062 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -384,19 +384,29 @@ private object YarnClusterDriver extends Logging with Matchers { } -private object YarnClasspathTest { +private object YarnClasspathTest extends Logging { + + var 
exitCode = 0 + + def error(m: String, ex: Throwable = null): Unit = { + logError(m, ex) + // scalastyle:off println + System.out.println(m) + if (ex != null) { + ex.printStackTrace(System.out) + } + // scalastyle:on println + } def main(args: Array[String]): Unit = { if (args.length != 2) { - // scalastyle:off println - System.err.println( + error( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: YarnClasspathTest [driver result file] [executor result file] """.stripMargin) // scalastyle:on println - System.exit(1) } readResource(args(0)) @@ -406,6 +416,7 @@ private object YarnClasspathTest { } finally { sc.stop() } + System.exit(exitCode) } private def readResource(resultPath: String): Unit = { @@ -415,6 +426,11 @@ private object YarnClasspathTest { val resource = ccl.getResourceAsStream("test.resource") val bytes = ByteStreams.toByteArray(resource) result = new String(bytes, 0, bytes.length, UTF_8) + } catch { + case t: Throwable => + error(s"loading test.resource to $resultPath", t) + // set the exit code if not yet set + exitCode = 2 } finally { Files.write(result, new File(resultPath), UTF_8) }
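The MonitorThread introduced above for SPARK-9519 only honors interrupts while it is still waiting on monitorApplication; once the application has exited and the thread starts stopping the SparkContext itself, stopMonitor() becomes a no-op. Below is a standalone sketch of that pattern, with made-up names and a sleep standing in for the monitoring call.

public class MonitorSketch {

  static class MonitorThread extends Thread {
    private volatile boolean allowInterrupt = true;
    private final Runnable onFinished;

    MonitorThread(Runnable onFinished) {
      this.onFinished = onFinished;
      setName("application state monitor");
      setDaemon(true);
    }

    @Override
    public void run() {
      try {
        Thread.sleep(Long.MAX_VALUE); // blocks until interrupted in this demo; stands in for client.monitorApplication(...)
        allowInterrupt = false;       // the application finished on its own: disallow further interrupts
        onFinished.run();             // stands in for sc.stop()
      } catch (InterruptedException e) {
        System.out.println("Interrupting monitor thread");
      }
    }

    void stopMonitor() {
      if (allowInterrupt) {
        interrupt();                  // only interrupt while still monitoring
      }
    }
  }

  public static void main(String[] args) throws Exception {
    MonitorThread t = new MonitorThread(() -> System.out.println("stopping context"));
    t.start();
    Thread.sleep(100);
    t.stopMonitor();                  // here the backend shuts down first, so the interrupt is allowed
    t.join();
  }
}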