diff --git a/LICENSE b/LICENSE index 0db2d14465bd3..dce8046e936ee 100644 --- a/LICENSE +++ b/LICENSE @@ -211,38 +211,708 @@ subcomponents is subject to the terms and conditions of the following licenses. +======================================================================= +For the Boto EC2 library (ec2/third_party/boto*.zip): +======================================================================= + +Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, dis- +tribute, sublicense, and/or sell copies of the Software, and to permit +persons to whom the Software is furnished to do so, subject to the fol- +lowing conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- +ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + + +======================================================================== +For CloudPickle (pyspark/cloudpickle.py): +======================================================================== + +Copyright (c) 2012, Regents of the University of California. +Copyright (c) 2009 `PiCloud, Inc. `_. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the University of California, Berkeley nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +======================================================================== +For Py4J (python/lib/py4j-0.8.2.1-src.zip) +======================================================================== + +Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +- The name of the author may not be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +======================================================================== +For DPark join code (python/pyspark/join.py): +======================================================================== + +Copyright (c) 2011, Douban Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + + * Neither the name of the Douban Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ======================================================================== For heapq (pyspark/heapq3.py): ======================================================================== -See license/LICENSE-heapq.txt +# A. 
HISTORY OF THE SOFTWARE +# ========================== +# +# Python was created in the early 1990s by Guido van Rossum at Stichting +# Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +# as a successor of a language called ABC. Guido remains Python's +# principal author, although it includes many contributions from others. +# +# In 1995, Guido continued his work on Python at the Corporation for +# National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +# in Reston, Virginia where he released several versions of the +# software. +# +# In May 2000, Guido and the Python core development team moved to +# BeOpen.com to form the BeOpen PythonLabs team. In October of the same +# year, the PythonLabs team moved to Digital Creations (now Zope +# Corporation, see http://www.zope.com). In 2001, the Python Software +# Foundation (PSF, see http://www.python.org/psf/) was formed, a +# non-profit organization created specifically to own Python-related +# Intellectual Property. Zope Corporation is a sponsoring member of +# the PSF. +# +# All Python releases are Open Source (see http://www.opensource.org for +# the Open Source Definition). Historically, most, but not all, Python +# releases have also been GPL-compatible; the table below summarizes +# the various releases. +# +# Release Derived Year Owner GPL- +# from compatible? (1) +# +# 0.9.0 thru 1.2 1991-1995 CWI yes +# 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes +# 1.6 1.5.2 2000 CNRI no +# 2.0 1.6 2000 BeOpen.com no +# 1.6.1 1.6 2001 CNRI yes (2) +# 2.1 2.0+1.6.1 2001 PSF no +# 2.0.1 2.0+1.6.1 2001 PSF yes +# 2.1.1 2.1+2.0.1 2001 PSF yes +# 2.2 2.1.1 2001 PSF yes +# 2.1.2 2.1.1 2002 PSF yes +# 2.1.3 2.1.2 2002 PSF yes +# 2.2.1 2.2 2002 PSF yes +# 2.2.2 2.2.1 2002 PSF yes +# 2.2.3 2.2.2 2003 PSF yes +# 2.3 2.2.2 2002-2003 PSF yes +# 2.3.1 2.3 2002-2003 PSF yes +# 2.3.2 2.3.1 2002-2003 PSF yes +# 2.3.3 2.3.2 2002-2003 PSF yes +# 2.3.4 2.3.3 2004 PSF yes +# 2.3.5 2.3.4 2005 PSF yes +# 2.4 2.3 2004 PSF yes +# 2.4.1 2.4 2005 PSF yes +# 2.4.2 2.4.1 2005 PSF yes +# 2.4.3 2.4.2 2006 PSF yes +# 2.4.4 2.4.3 2006 PSF yes +# 2.5 2.4 2006 PSF yes +# 2.5.1 2.5 2007 PSF yes +# 2.5.2 2.5.1 2008 PSF yes +# 2.5.3 2.5.2 2008 PSF yes +# 2.6 2.5 2008 PSF yes +# 2.6.1 2.6 2008 PSF yes +# 2.6.2 2.6.1 2009 PSF yes +# 2.6.3 2.6.2 2009 PSF yes +# 2.6.4 2.6.3 2009 PSF yes +# 2.6.5 2.6.4 2010 PSF yes +# 2.7 2.6 2010 PSF yes +# +# Footnotes: +# +# (1) GPL-compatible doesn't mean that we're distributing Python under +# the GPL. All Python licenses, unlike the GPL, let you distribute +# a modified version without making your changes open source. The +# GPL-compatible licenses make it possible to combine Python with +# other software that is released under the GPL; the others don't. +# +# (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, +# because its license has a choice of law clause. According to +# CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 +# is "not incompatible" with the GPL. +# +# Thanks to the many outside volunteers who have worked under Guido's +# direction to make these releases possible. +# +# +# B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +# =============================================================== +# +# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# -------------------------------------------- +# +# 1. 
This LICENSE AGREEMENT is between the Python Software Foundation +# ("PSF"), and the Individual or Organization ("Licensee") accessing and +# otherwise using this software ("Python") in source or binary form and +# its associated documentation. +# +# 2. Subject to the terms and conditions of this License Agreement, PSF hereby +# grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +# analyze, test, perform and/or display publicly, prepare derivative works, +# distribute, and otherwise use Python alone or in any derivative version, +# provided, however, that PSF's License Agreement and PSF's notice of copyright, +# i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained +# in Python alone or in any derivative version prepared by Licensee. +# +# 3. In the event Licensee prepares a derivative work that is based on +# or incorporates Python or any part thereof, and wants to make +# the derivative work available to others as provided herein, then +# Licensee hereby agrees to include in any such work a brief summary of +# the changes made to Python. +# +# 4. PSF is making Python available to Licensee on an "AS IS" +# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. Nothing in this License Agreement shall be deemed to create any +# relationship of agency, partnership, or joint venture between PSF and +# Licensee. This License Agreement does not grant permission to use PSF +# trademarks or trade name in a trademark sense to endorse or promote +# products or services of Licensee, or any third party. +# +# 8. By copying, installing or otherwise using Python, Licensee +# agrees to be bound by the terms and conditions of this License +# Agreement. +# +# +# BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +# ------------------------------------------- +# +# BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 +# +# 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +# office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +# Individual or Organization ("Licensee") accessing and otherwise using +# this software in source or binary form and its associated +# documentation ("the Software"). +# +# 2. Subject to the terms and conditions of this BeOpen Python License +# Agreement, BeOpen hereby grants Licensee a non-exclusive, +# royalty-free, world-wide license to reproduce, analyze, test, perform +# and/or display publicly, prepare derivative works, distribute, and +# otherwise use the Software alone or in any derivative version, +# provided, however, that the BeOpen Python License is retained in the +# Software, alone or in any derivative version prepared by Licensee. +# +# 3. BeOpen is making the Software available to Licensee on an "AS IS" +# basis. 
BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +# SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +# AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +# DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 5. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 6. This License Agreement shall be governed by and interpreted in all +# respects by the law of the State of California, excluding conflict of +# law provisions. Nothing in this License Agreement shall be deemed to +# create any relationship of agency, partnership, or joint venture +# between BeOpen and Licensee. This License Agreement does not grant +# permission to use BeOpen trademarks or trade names in a trademark +# sense to endorse or promote products or services of Licensee, or any +# third party. As an exception, the "BeOpen Python" logos available at +# http://www.pythonlabs.com/logos.html may be used according to the +# permissions granted on that web page. +# +# 7. By copying, installing or otherwise using the software, Licensee +# agrees to be bound by the terms and conditions of this License +# Agreement. +# +# +# CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +# --------------------------------------- +# +# 1. This LICENSE AGREEMENT is between the Corporation for National +# Research Initiatives, having an office at 1895 Preston White Drive, +# Reston, VA 20191 ("CNRI"), and the Individual or Organization +# ("Licensee") accessing and otherwise using Python 1.6.1 software in +# source or binary form and its associated documentation. +# +# 2. Subject to the terms and conditions of this License Agreement, CNRI +# hereby grants Licensee a nonexclusive, royalty-free, world-wide +# license to reproduce, analyze, test, perform and/or display publicly, +# prepare derivative works, distribute, and otherwise use Python 1.6.1 +# alone or in any derivative version, provided, however, that CNRI's +# License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +# 1995-2001 Corporation for National Research Initiatives; All Rights +# Reserved" are retained in Python 1.6.1 alone or in any derivative +# version prepared by Licensee. Alternately, in lieu of CNRI's License +# Agreement, Licensee may substitute the following text (omitting the +# quotes): "Python 1.6.1 is made available subject to the terms and +# conditions in CNRI's License Agreement. This Agreement together with +# Python 1.6.1 may be located on the Internet using the following +# unique, persistent identifier (known as a handle): 1895.22/1013. This +# Agreement may also be obtained from a proxy server on the Internet +# using the following URL: http://hdl.handle.net/1895.22/1013". +# +# 3. In the event Licensee prepares a derivative work that is based on +# or incorporates Python 1.6.1 or any part thereof, and wants to make +# the derivative work available to others as provided herein, then +# Licensee hereby agrees to include in any such work a brief summary of +# the changes made to Python 1.6.1. +# +# 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +# basis. 
CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +# 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. This License Agreement shall be governed by the federal +# intellectual property law of the United States, including without +# limitation the federal copyright law, and, to the extent such +# U.S. federal law does not apply, by the law of the Commonwealth of +# Virginia, excluding Virginia's conflict of law provisions. +# Notwithstanding the foregoing, with regard to derivative works based +# on Python 1.6.1 that incorporate non-separable material that was +# previously distributed under the GNU General Public License (GPL), the +# law of the Commonwealth of Virginia shall govern this License +# Agreement only as to issues arising under or with respect to +# Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +# License Agreement shall be deemed to create any relationship of +# agency, partnership, or joint venture between CNRI and Licensee. This +# License Agreement does not grant permission to use CNRI trademarks or +# trade name in a trademark sense to endorse or promote products or +# services of Licensee, or any third party. +# +# 8. By clicking on the "ACCEPT" button where indicated, or by copying, +# installing or otherwise using Python 1.6.1, Licensee agrees to be +# bound by the terms and conditions of this License Agreement. +# +# ACCEPT +# +# +# CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +# -------------------------------------------------- +# +# Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +# The Netherlands. All rights reserved. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appear in all copies and that +# both that copyright notice and this permission notice appear in +# supporting documentation, and that the name of Stichting Mathematisch +# Centrum or CWI not be used in advertising or publicity pertaining to +# distribution of the software without specific, written prior +# permission. +# +# STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +======================================================================== +For sorttable (core/src/main/resources/org/apache/spark/ui/static/sorttable.js): +======================================================================== + +Copyright (c) 1997-2007 Stuart Langridge + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +======================================================================== +For d3 (core/src/main/resources/org/apache/spark/ui/static/d3.min.js): +======================================================================== + +Copyright (c) 2010-2015, Michael Bostock +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* The name Michael Bostock may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +======================================================================== +For Scala Interpreter classes (all .scala files in repl/src/main/scala +except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), +and for SerializableMapWrapper in JavaUtils.scala: +======================================================================== + +Copyright (c) 2002-2013 EPFL +Copyright (c) 2011-2013 Typesafe, Inc. + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +- Neither the name of the EPFL nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +======================================================================== +For sbt and sbt-launch-lib.bash in sbt/: +======================================================================== + +// Generated from http://www.opensource.org/licenses/bsd-license.php +Copyright (c) 2011, Paul Phillips. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the author nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ======================================================================== For SnapTree: ======================================================================== -See license/LICENSE-SnapTree.txt +SNAPTREE LICENSE + +Copyright (c) 2009-2012 Stanford University, unless otherwise specified. +All rights reserved. 
+ +This software was developed by the Pervasive Parallelism Laboratory of +Stanford University, California, USA. + +Permission to use, copy, modify, and distribute this software in source +or binary form for any purpose with or without fee is hereby granted, +provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of Stanford University nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + + +======================================================================== +For Timsort (core/src/main/java/org/apache/spark/util/collection/TimSort.java): +======================================================================== +Copyright (C) 2008 The Android Open Source Project + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +======================================================================== +For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java): +======================================================================== +Copyright (C) 2015 Stijn de Gouw + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +======================================================================== +For LimitedInputStream + (network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java): +======================================================================== +Copyright (C) 2007 The Guava Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +======================================================================== +For vis.js (core/src/main/resources/org/apache/spark/ui/static/vis.min.js): +======================================================================== +Copyright (C) 2010-2015 Almende B.V. + +Vis.js is dual licensed under both + + * The Apache 2.0 License + http://www.apache.org/licenses/LICENSE-2.0 + +and + + * The MIT License + http://opensource.org/licenses/MIT + +Vis.js may be distributed under either license. ======================================================================== -For jbcrypt: +For dagre-d3 (core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js): ======================================================================== +Copyright (c) 2013 Chris Pettitt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. -See license/LICENSE-jbcrypt.txt +======================================================================== +For graphlib-dot (core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js): +======================================================================== +Copyright (c) 2012-2013 Chris Pettitt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. ======================================================================== BSD-style licenses ======================================================================== The following components are provided under a BSD-style license. See project link for details. -The text of each license is also included at licenses/LICENSE-[project].txt. - (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) + (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model) (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.4 - http://jblas.org/) (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/) + (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) - (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) + (BSD style) Hamcrest Core (org.hamcrest:hamcrest-core:1.1 - no url defined) (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) @@ -267,17 +937,13 @@ The text of each license is also included at licenses/LICENSE-[project].txt. (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) (The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) - (BSD licence) sbt and sbt-launch-lib.bash - (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) - (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE) - (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE) + (ISC/BSD License) jbcrypt (org.mindrot:jbcrypt:0.3m - http://www.mindrot.org/) ======================================================================== MIT licenses ======================================================================== The following components are provided under the MIT License. See project link for details. -The text of each license is also included at licenses/LICENSE-[project].txt. (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org) (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org) @@ -288,7 +954,3 @@ The text of each license is also included at licenses/LICENSE-[project].txt. 
(The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org) (MIT License) jquery (https://jquery.org/license/) (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs) - (MIT License) graphlib-dot (https://github.com/cpettitt/graphlib-dot) - (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3) - (MIT License) sorttable (https://github.com/stuartlangridge/sorttable) - (MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE) diff --git a/NOTICE b/NOTICE index 7f7769f73047f..452aef2871652 100644 --- a/NOTICE +++ b/NOTICE @@ -572,38 +572,3 @@ Copyright 2009-2013 The Apache Software Foundation Apache Avro IPC Copyright 2009-2013 The Apache Software Foundation - - -Vis.js -Copyright 2010-2015 Almende B.V. - -Vis.js is dual licensed under both - - * The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0 - - and - - * The MIT License - http://opensource.org/licenses/MIT - -Vis.js may be distributed under either license. - - -Vis.js uses and redistributes the following third-party libraries: - -- component-emitter - https://github.com/component/emitter - The MIT License - -- hammer.js - http://hammerjs.github.io/ - The MIT License - -- moment.js - http://momentjs.com/ - The MIT License - -- keycharm - https://github.com/AlexDM0/keycharm - The MIT License \ No newline at end of file diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 3d6edb70ec98e..369714f7b99c2 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -34,4 +34,5 @@ Collate: 'serialize.R' 'sparkR.R' 'stats.R' + 'types.R' 'utils.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 56b8ed0bf271b..d4bf6c868bb46 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -23,9 +23,11 @@ export("setJobGroup", exportClasses("DataFrame") exportMethods("arrange", - "attach", + "as.data.frame", + "attach", "cache", "collect", + "coltypes", "columns", "count", "cov", @@ -264,4 +266,3 @@ export("structField", "structType.structField", "print.structType") -export("as.data.frame") diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 44ce9414da5cf..17e3c096ec168 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2152,3 +2152,58 @@ setMethod("with", newEnv <- assignNewEnv(data) eval(substitute(expr), envir = newEnv, enclos = newEnv) }) + +#' Returns the column types of a DataFrame. 
+#' +#' @name coltypes +#' @title Get column types of a DataFrame +#' @param x (DataFrame) +#' @return value (character) A character vector with the column types of the given DataFrame +#' @rdname coltypes +setMethod("coltypes", + signature(x = "DataFrame"), + function(x) { + # TODO: This may be moved as a global parameter + # These are the supported data types and how they map to + # R's data types + DATA_TYPES <- c("string"="character", + "long"="integer", + "tinyint"="integer", + "short"="integer", + "integer"="integer", + "byte"="integer", + "double"="numeric", + "float"="numeric", + "decimal"="numeric", + "boolean"="logical" + ) + + # Get the data types of the DataFrame by invoking dtypes() function + types <- sapply(dtypes(x), function(x) {x[[2]]}) + + # Map Spark data types into R's data types using DATA_TYPES environment + rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) { + + # Check for primitive types + type <- PRIMITIVE_TYPES[[x]] + if (is.null(type)) { + # Check for complex types + typeName <- Filter(function(t) { substring(x, 1, nchar(t)) == t}, + names(COMPLEX_TYPES)) + if (length(typeName) > 0) { + type <- COMPLEX_TYPES[[typeName]] + } else { + stop(paste("Unsupported data type: ", x)) + } + } + type + }) + + # Find which types don't have mapping to R + naIndices <- which(is.na(rTypes)) + + # Assign the original scala data types to the unmatched ones + rTypes[naIndices] <- types[naIndices] + + rTypes + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 083d37fee28a4..7159c6093926b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1027,7 +1027,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) #' @export setGeneric("year", function(x) { standardGeneric("year") }) - #' @rdname glm #' @export setGeneric("glm") @@ -1047,3 +1046,7 @@ setGeneric("attach") #' @rdname with #' @export setGeneric("with") + +#' @rdname coltypes +#' @export +setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b0d73dd93a79d..78d63b92946ef 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj")) #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package. #' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. +#' operators are supported, including '~', '+', '-', and '.'. #' @param data DataFrame for training #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg. 
#' @param lambda Regularization parameter @@ -41,8 +41,7 @@ setClass("PipelineModel", representation(model = "jobj")) #' sqlContext <- sparkRSQL.init(sc) #' data(iris) #' df <- createDataFrame(sqlContext, iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian") -#' summary(model) +#' model <- glm(Sepal_Length ~ Sepal_Width, df) #'} setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"), function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0, diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 6f0e9a94e9bfa..12093da1baa10 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -115,20 +115,7 @@ structField.jobj <- function(x) { } checkType <- function(type) { - primtiveTypes <- c("byte", - "integer", - "float", - "double", - "numeric", - "character", - "string", - "binary", - "raw", - "logical", - "boolean", - "timestamp", - "date") - if (type %in% primtiveTypes) { + if (type %in% names(PRIMITIVE_TYPES)) { return() } else { # Check complex types diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R new file mode 100644 index 0000000000000..1828c23ab0f6d --- /dev/null +++ b/R/pkg/R/types.R @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# types.R. This file handles the data type mapping between Spark and R + +# The primitive data types, where names(PRIMITIVE_TYPES) are Scala types whereas +# values are equivalent R types. This is stored in an environment to allow for +# more efficient look up (environments use hashmaps). +PRIMITIVE_TYPES <- as.environment(list( + "byte"="integer", + "tinyint"="integer", + "smallint"="integer", + "integer"="integer", + "bigint"="numeric", + "float"="numeric", + "double"="numeric", + "decimal"="numeric", + "string"="character", + "binary"="raw", + "boolean"="logical", + "timestamp"="POSIXct", + "date"="Date")) + +# The complex data types. These do not have any direct mapping to R's types. +COMPLEX_TYPES <- list( + "map"=NA, + "array"=NA, + "struct"=NA) + +# The full list of data types. 
+DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R index 4761e285a2479..4203f7ff5138c 100644 --- a/R/pkg/inst/tests/test_mllib.R +++ b/R/pkg/inst/tests/test_mllib.R @@ -61,14 +61,6 @@ test_that("dot minus and intercept vs native glm", { expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) }) -test_that("feature interaction vs native glm", { - training <- createDataFrame(sqlContext, iris) - model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) -}) - test_that("summary coefficients match with native glm", { training <- createDataFrame(sqlContext, iris) stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "l-bfgs")) @@ -77,7 +69,7 @@ test_that("summary coefficients match with native glm", { expect_true(all(abs(rCoefs - coefs) < 1e-6)) expect_true(all( as.character(stats$features) == - c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) + c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica"))) }) test_that("summary coefficients match with native glm of family 'binomial'", { diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 92cff1fba7193..81bba149a24fe 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -695,13 +695,6 @@ test_that("select with column", { expect_equal(columns(df3), c("x")) expect_equal(count(df3), 3) expect_equal(collect(select(df3, "x"))[[1, 1]], "x") - - df4 <- select(df, c("name", "age")) - expect_equal(columns(df4), c("name", "age")) - expect_equal(count(df4), 3) - - expect_error(select(df, c("name", "age"), "name"), - "To select multiple columns, use a character vector or list for col") }) test_that("subsetting", { @@ -1467,8 +1460,9 @@ test_that("SQL error message is returned from JVM", { expect_equal(grepl("Table not found: blah", retError), TRUE) }) +irisDF <- createDataFrame(sqlContext, iris) + test_that("Method as.data.frame as a synonym for collect()", { - irisDF <- createDataFrame(sqlContext, iris) expect_equal(as.data.frame(irisDF), collect(irisDF)) irisDF2 <- irisDF[irisDF$Species == "setosa", ] expect_equal(as.data.frame(irisDF2), collect(irisDF2)) @@ -1503,6 +1497,27 @@ test_that("with() on a DataFrame", { expect_equal(nrow(sum2), 35) }) +test_that("Method coltypes() to get R's data types of a DataFrame", { + expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character")) + + data <- data.frame(c1=c(1,2,3), + c2=c(T,F,T), + c3=c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00")) + + schema <- structType(structField("c1", "byte"), + structField("c3", "boolean"), + structField("c4", "timestamp")) + + # Test primitive types + DF <- createDataFrame(sqlContext, data, schema) + expect_equal(coltypes(DF), c("integer", "logical", "POSIXct")) + + # Test complex types + x <- createDataFrame(sqlContext, list(list(as.environment( + list("a"="b", "c"="d", "e"="f"))))) + expect_equal(coltypes(x), "map") +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) diff --git a/README.md b/README.md index c0d6a946035a9..f0276ae9c74d7 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ will run the Pi example locally. 
You can set the MASTER environment variable when running examples to submit examples to a cluster. This can be a mesos:// or spark:// URL, -"yarn" to run on YARN, and "local" to run +"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run locally with one thread, or "local[N]" to run locally with N threads. You can also use an abbreviated class name if the class is in the `examples` package. For instance: diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index ee82d679935c0..cb67d05381765 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -192,7 +192,7 @@ private long[] writePartitionedFile(File outputFile) throws IOException { } finally { Closeables.close(in, copyThrewException); } - if (!partitionWriters[i].fileSegment().file().delete()) { + if (!blockManager.diskBlockManager().getFile(partitionWriters[i].blockId()).delete()) { logger.error("Unable to delete file for partition {}", i); } } diff --git a/core/src/main/java/org/apache/spark/util/collection/TimSort.java b/core/src/main/java/org/apache/spark/util/collection/TimSort.java index 40b5fb7fe4b49..a90cc0e761f62 100644 --- a/core/src/main/java/org/apache/spark/util/collection/TimSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/TimSort.java @@ -15,24 +15,6 @@ * limitations under the License. */ -/* - * Based on TimSort.java from the Android Open Source Project - * - * Copyright (C) 2008 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark.util.collection; import java.util.Comparator; diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java index d2bf297c6c178..71b76d5ddfaa7 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java @@ -21,7 +21,6 @@ import org.apache.spark.annotation.Private; import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.types.ByteArray; import org.apache.spark.unsafe.types.UTF8String; import org.apache.spark.util.Utils; @@ -63,7 +62,21 @@ public int compare(long aPrefix, long bPrefix) { } public static long computePrefix(byte[] bytes) { - return ByteArray.getPrefix(bytes); + if (bytes == null) { + return 0L; + } else { + /** + * TODO: If a wrapper for BinaryType is created (SPARK-8786), + * these codes below will be in the wrapper class. 
+ */ + final int minLen = Math.min(bytes.length, 8); + long p = 0; + for (int i = 0; i < minLen; ++i) { + p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i)) + << (56 - 8 * i); + } + return p; + } } } diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 1f1f0b75de5f1..ee60d697d8799 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -20,7 +20,6 @@ package org.apache.spark import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable -import scala.concurrent.Future import org.apache.spark.executor.TaskMetrics import org.apache.spark.rpc.{ThreadSafeRpcEndpoint, RpcEnv, RpcCallContext} @@ -148,31 +147,11 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) } } - /** - * Send ExecutorRegistered to the event loop to add a new executor. Only for test. - * - * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that - * indicate if this operation is successful. - */ - def addExecutor(executorId: String): Option[Future[Boolean]] = { - Option(self).map(_.ask[Boolean](ExecutorRegistered(executorId))) - } - /** * If the heartbeat receiver is not stopped, notify it of executor registrations. */ override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { - addExecutor(executorAdded.executorId) - } - - /** - * Send ExecutorRemoved to the event loop to remove a executor. Only for test. - * - * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that - * indicate if this operation is successful. - */ - def removeExecutor(executorId: String): Option[Future[Boolean]] = { - Option(self).map(_.ask[Boolean](ExecutorRemoved(executorId))) + Option(self).foreach(_.ask[Boolean](ExecutorRegistered(executorAdded.executorId))) } /** @@ -186,7 +165,7 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) * and expire it with loud error messages. 
*/ override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { - removeExecutor(executorRemoved.executorId) + Option(self).foreach(_.ask[Boolean](ExecutorRemoved(executorRemoved.executorId))) } private def expireDeadHosts(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 72355cdfa68b3..9e847afa47c1e 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -45,7 +45,7 @@ private[spark] class MapOutputTrackerMasterEndpoint( override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetMapOutputStatuses(shuffleId: Int) => - val hostPort = context.senderAddress.hostPort + val hostPort = context.sender.address.hostPort logInfo("Asked to send map output locations for shuffle " + shuffleId + " to " + hostPort) val mapOutputStatuses = tracker.getSerializedMapOutputStatuses(shuffleId) val serializedSize = mapOutputStatuses.length @@ -134,25 +134,11 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging */ def getMapSizesByExecutorId(shuffleId: Int, reduceId: Int) : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { - getMapSizesByExecutorId(shuffleId, reduceId, reduceId + 1) - } - - /** - * Called from executors to get the server URIs and output sizes for each shuffle block that - * needs to be read from a given range of map output partitions (startPartition is included but - * endPartition is excluded from the range). - * - * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, - * and the second item is a sequence of (shuffle block id, shuffle block size) tuples - * describing the shuffle blocks that are stored at that block manager. - */ - def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) - : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { - logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition") + logDebug(s"Fetching outputs for shuffle $shuffleId, reduce $reduceId") val statuses = getStatuses(shuffleId) // Synchronize on the returned array because, on the driver, it gets mutated in place statuses.synchronized { - return MapOutputTracker.convertMapStatuses(shuffleId, startPartition, endPartition, statuses) + return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses) } } @@ -276,21 +262,6 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) /** Cache a serialized version of the output statuses for each shuffle to send them out faster */ private var cacheEpoch = epoch - /** Whether to compute locality preferences for reduce tasks */ - private val shuffleLocalityEnabled = conf.getBoolean("spark.shuffle.reduceLocality.enabled", true) - - // Number of map and reduce tasks above which we do not assign preferred locations based on map - // output sizes. We limit the size of jobs for which assign preferred locations as computing the - // top locations by size becomes expensive. - private val SHUFFLE_PREF_MAP_THRESHOLD = 1000 - // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that - private val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000 - - // Fraction of total map output that must be at a location for it to considered as a preferred - // location for a reduce task. 
Making this larger will focus on fewer locations where most data - // can be read locally, but may lead to more delay in scheduling if those locations are busy. - private val REDUCER_PREF_LOCS_FRACTION = 0.2 - /** * Timestamp based HashMap for storing mapStatuses and cached serialized statuses in the driver, * so that statuses are dropped only by explicit de-registering or by TTL-based cleaning (if set). @@ -351,30 +322,6 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) cachedSerializedStatuses.contains(shuffleId) || mapStatuses.contains(shuffleId) } - /** - * Return the preferred hosts on which to run the given map output partition in a given shuffle, - * i.e. the nodes that the most outputs for that partition are on. - * - * @param dep shuffle dependency object - * @param partitionId map output partition that we want to read - * @return a sequence of host names - */ - def getPreferredLocationsForShuffle(dep: ShuffleDependency[_, _, _], partitionId: Int) - : Seq[String] = { - if (shuffleLocalityEnabled && dep.rdd.partitions.length < SHUFFLE_PREF_MAP_THRESHOLD && - dep.partitioner.numPartitions < SHUFFLE_PREF_REDUCE_THRESHOLD) { - val blockManagerIds = getLocationsWithLargestOutputs(dep.shuffleId, partitionId, - dep.partitioner.numPartitions, REDUCER_PREF_LOCS_FRACTION) - if (blockManagerIds.nonEmpty) { - blockManagerIds.get.map(_.host) - } else { - Nil - } - } else { - Nil - } - } - /** * Return a list of locations that each have fraction of map output greater than the specified * threshold. @@ -513,25 +460,23 @@ private[spark] object MapOutputTracker extends Logging { } /** - * Given an array of map statuses and a range of map output partitions, returns a sequence that, - * for each block manager ID, lists the shuffle block IDs and corresponding shuffle block sizes - * stored at that block manager. + * Converts an array of MapStatuses for a given reduce ID to a sequence that, for each block + * manager ID, lists the shuffle block ids and corresponding shuffle block sizes stored at that + * block manager. * * If any of the statuses is null (indicating a missing location due to a failed mapper), * throws a FetchFailedException. * * @param shuffleId Identifier for the shuffle - * @param startPartition Start of map output partition ID range (included in range) - * @param endPartition End of map output partition ID range (excluded from range) + * @param reduceId Identifier for the reduce task * @param statuses List of map statuses, indexed by map ID. * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, - * and the second item is a sequence of (shuffle block ID, shuffle block size) tuples + * and the second item is a sequence of (shuffle block id, shuffle block size) tuples * describing the shuffle blocks that are stored at that block manager. 
*/ private def convertMapStatuses( shuffleId: Int, - startPartition: Int, - endPartition: Int, + reduceId: Int, statuses: Array[MapStatus]): Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { assert (statuses != null) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(BlockId, Long)]] @@ -539,12 +484,10 @@ private[spark] object MapOutputTracker extends Logging { if (status == null) { val errorMessage = s"Missing an output location for shuffle $shuffleId" logError(errorMessage) - throw new MetadataFetchFailedException(shuffleId, startPartition, errorMessage) + throw new MetadataFetchFailedException(shuffleId, reduceId, errorMessage) } else { - for (part <- startPartition until endPartition) { - splitsByAddress.getOrElseUpdate(status.location, ArrayBuffer()) += - ((ShuffleBlockId(shuffleId, mapId, part), status.getSizeForBlock(part))) - } + splitsByAddress.getOrElseUpdate(status.location, ArrayBuffer()) += + ((ShuffleBlockId(shuffleId, mapId, reduceId), status.getSizeForBlock(reduceId))) } } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 7421821e2601b..ddb8aa0c1bbf1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1740,8 +1740,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } SparkEnv.set(null) } - // Unset YARN mode system env variable, to allow switching between cluster types. - System.clearProperty("SPARK_YARN_MODE") SparkContext.clearActiveContext() logInfo("Successfully stopped SparkContext") } diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 23ae9360f6a22..8f0721a4c6f93 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -20,10 +20,11 @@ package org.apache.spark import java.io.File import java.net.Socket +import akka.actor.ActorSystem + import scala.collection.mutable import scala.util.Properties -import akka.actor.ActorSystem import com.google.common.collect.MapMaker import org.apache.spark.annotation.DeveloperApi @@ -56,7 +57,6 @@ import org.apache.spark.util.{AkkaUtils, RpcUtils, Utils} class SparkEnv ( val executorId: String, private[spark] val rpcEnv: RpcEnv, - _actorSystem: ActorSystem, // TODO Remove actorSystem val serializer: Serializer, val closureSerializer: Serializer, val cacheManager: CacheManager, @@ -75,7 +75,7 @@ class SparkEnv ( // TODO Remove actorSystem @deprecated("Actor system is no longer supported as of 1.4.0", "1.4.0") - val actorSystem: ActorSystem = _actorSystem + val actorSystem: ActorSystem = rpcEnv.asInstanceOf[AkkaRpcEnv].actorSystem private[spark] var isStopped = false private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]() @@ -99,9 +99,6 @@ class SparkEnv ( blockManager.master.stop() metricsSystem.stop() outputCommitCoordinator.stop() - if (!rpcEnv.isInstanceOf[AkkaRpcEnv]) { - actorSystem.shutdown() - } rpcEnv.shutdown() // Unfortunately Akka's awaitTermination doesn't actually wait for the Netty server to shut @@ -405,7 +402,6 @@ object SparkEnv extends Logging { val envInstance = new SparkEnv( executorId, rpcEnv, - actorSystem, serializer, closureSerializer, cacheManager, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 8464b578ed09e..19be0939038b8 
100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -633,7 +633,7 @@ private[spark] object PythonRDD extends Logging { * * The thread will terminate after all the data are sent or any exceptions happen. */ - def serveIterator[T](items: Iterator[T], threadName: String): Int = { + private def serveIterator[T](items: Iterator[T], threadName: String): Int = { val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) // Close the socket if no connection in 3 seconds serverSocket.setSoTimeout(3000) diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index ed183cf16a9cb..6e5982712766c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -25,7 +25,6 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.spark.api.r.{RBackend, RUtils} -import org.apache.spark.{SparkException, SparkUserAppException} import org.apache.spark.util.RedirectThread /** @@ -94,15 +93,12 @@ object RRunner { } finally { sparkRBackend.close() } - if (returnCode != 0) { - throw new SparkUserAppException(returnCode) - } + System.exit(returnCode) } else { - val errorMessage = s"SparkR backend did not initialize in $backendTimeout seconds" // scalastyle:off println - System.err.println(errorMessage) + System.err.println("SparkR backend did not initialize in " + backendTimeout + " seconds") // scalastyle:on println - throw new SparkException(errorMessage) + System.exit(-1) } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index d606b80c03c98..a0b7365df900a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -385,13 +385,20 @@ class SparkHadoopUtil extends Logging { object SparkHadoopUtil { - private lazy val hadoop = new SparkHadoopUtil - private lazy val yarn = try { - Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") - .newInstance() - .asInstanceOf[SparkHadoopUtil] - } catch { - case e: Exception => throw new SparkException("Unable to load YARN support", e) + private val hadoop = { + val yarnMode = java.lang.Boolean.valueOf( + System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) + if (yarnMode) { + try { + Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") + .newInstance() + .asInstanceOf[SparkHadoopUtil] + } catch { + case e: Exception => throw new SparkException("Unable to load YARN support", e) + } + } else { + new SparkHadoopUtil + } } val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" @@ -399,13 +406,6 @@ object SparkHadoopUtil { val SPARK_YARN_CREDS_COUNTER_DELIM = "-" def get: SparkHadoopUtil = { - // Check each time to support changing to/from YARN - val yarnMode = java.lang.Boolean.valueOf( - System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) - if (yarnMode) { - yarn - } else { - hadoop - } + hadoop } } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala index d03bab3820bb2..18265df9faa2c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala +++ 
b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala @@ -30,35 +30,28 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin parse(args.toList) private def parse(args: List[String]): Unit = { - if (args.length == 1) { - setLogDirectory(args.head) - } else { - args match { - case ("--dir" | "-d") :: value :: tail => - setLogDirectory(value) - parse(tail) + args match { + case ("--dir" | "-d") :: value :: tail => + logWarning("Setting log directory through the command line is deprecated as of " + + "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") + conf.set("spark.history.fs.logDirectory", value) + System.setProperty("spark.history.fs.logDirectory", value) + parse(tail) - case ("--help" | "-h") :: tail => - printUsageAndExit(0) + case ("--help" | "-h") :: tail => + printUsageAndExit(0) - case ("--properties-file") :: value :: tail => - propertiesFile = value - parse(tail) + case ("--properties-file") :: value :: tail => + propertiesFile = value + parse(tail) - case Nil => + case Nil => - case _ => - printUsageAndExit(1) - } + case _ => + printUsageAndExit(1) } } - private def setLogDirectory(value: String): Unit = { - logWarning("Setting log directory through the command line is deprecated as of " + - "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") - conf.set("spark.history.fs.logDirectory", value) - } - // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) @@ -69,8 +62,6 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin |Usage: HistoryServer [options] | |Options: - | DIR Deprecated; set spark.history.fs.logDirectory directly - | --dir DIR (-d DIR) Deprecated; set spark.history.fs.logDirectory directly | --properties-file FILE Path to a custom Spark properties file. | Default is conf/spark-defaults.conf. | @@ -99,4 +90,3 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin } } - diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index a45867e7680ec..865e011ff383d 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -327,7 +327,7 @@ private[deploy] class Worker( registrationRetryTimer = Some(forwordMessageScheduler.scheduleAtFixedRate( new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { - Option(self).foreach(_.send(ReregisterWithMaster)) + self.send(ReregisterWithMaster) } }, INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS, diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala index ab56fde938bae..735c4f0927150 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala @@ -24,13 +24,14 @@ import org.apache.spark.rpc._ * Actor which connects to a worker process and terminates the JVM if the connection is severed. * Provides fate sharing between a worker and its associated child processes. 
*/ -private[spark] class WorkerWatcher( - override val rpcEnv: RpcEnv, workerUrl: String, isTesting: Boolean = false) +private[spark] class WorkerWatcher(override val rpcEnv: RpcEnv, workerUrl: String) extends RpcEndpoint with Logging { - logInfo(s"Connecting to worker $workerUrl") - if (!isTesting) { - rpcEnv.asyncSetupEndpointRefByURI(workerUrl) + override def onStart() { + logInfo(s"Connecting to worker $workerUrl") + if (!isTesting) { + rpcEnv.asyncSetupEndpointRefByURI(workerUrl) + } } // Used to avoid shutting down JVM during tests @@ -39,6 +40,8 @@ private[spark] class WorkerWatcher( // true rather than calling `System.exit`. The user can check `isShutDown` to know if // `exitNonZero` is called. private[deploy] var isShutDown = false + private[deploy] def setTesting(testing: Boolean) = isTesting = testing + private var isTesting = false // Lets filter events only from the worker's rpc system private val expectedAddress = RpcAddress.fromURIString(workerUrl) diff --git a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala index a013c3f66a3a8..cb15d912bbfb5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala @@ -86,12 +86,6 @@ class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag]( Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } - override def getPreferredLocations(partition: Partition): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] - tracker.getPreferredLocationsForShuffle(dep, partition.index) - } - override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala b/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala index f527ec86ab7b2..3e5b64265e919 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala @@ -37,5 +37,5 @@ private[spark] trait RpcCallContext { /** * The sender of this message. */ - def senderAddress: RpcAddress + def sender: RpcEndpointRef } diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala index 0ba95169529e6..5d8bd1907caa5 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala @@ -28,6 +28,20 @@ private[spark] trait RpcEnvFactory { def create(config: RpcEnvConfig): RpcEnv } +/** + * A trait that requires RpcEnv thread-safely sending messages to it. + * + * Thread-safety means processing of one message happens before processing of the next message by + * the same [[ThreadSafeRpcEndpoint]]. In the other words, changes to internal fields of a + * [[ThreadSafeRpcEndpoint]] are visible when processing the next message, and fields in the + * [[ThreadSafeRpcEndpoint]] need not be volatile or equivalent. + * + * However, there is no guarantee that the same thread will be executing the same + * [[ThreadSafeRpcEndpoint]] for different messages. 
+ */ +private[spark] trait ThreadSafeRpcEndpoint extends RpcEndpoint + + /** * An end point for the RPC that defines what functions to trigger given a message. * @@ -87,39 +101,38 @@ private[spark] trait RpcEndpoint { } /** - * Invoked when `remoteAddress` is connected to the current node. + * Invoked before [[RpcEndpoint]] starts to handle any message. */ - def onConnected(remoteAddress: RpcAddress): Unit = { + def onStart(): Unit = { // By default, do nothing. } /** - * Invoked when `remoteAddress` is lost. + * Invoked when [[RpcEndpoint]] is stopping. */ - def onDisconnected(remoteAddress: RpcAddress): Unit = { + def onStop(): Unit = { // By default, do nothing. } /** - * Invoked when some network error happens in the connection between the current node and - * `remoteAddress`. + * Invoked when `remoteAddress` is connected to the current node. */ - def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { + def onConnected(remoteAddress: RpcAddress): Unit = { // By default, do nothing. } /** - * Invoked before [[RpcEndpoint]] starts to handle any message. + * Invoked when `remoteAddress` is lost. */ - def onStart(): Unit = { + def onDisconnected(remoteAddress: RpcAddress): Unit = { // By default, do nothing. } /** - * Invoked when [[RpcEndpoint]] is stopping. `self` will be `null` in this method and you cannot - * use it to send or ask messages. + * Invoked when some network error happens in the connection between the current node and + * `remoteAddress`. */ - def onStop(): Unit = { + def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { // By default, do nothing. } diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointNotFoundException.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointNotFoundException.scala deleted file mode 100644 index d177881fb3053..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointNotFoundException.scala +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.rpc - -import org.apache.spark.SparkException - -private[rpc] class RpcEndpointNotFoundException(uri: String) - extends SparkException(s"Cannot find endpoint: $uri") diff --git a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala index 3fad595a0d0b0..0eda24d541289 100644 --- a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala @@ -162,9 +162,9 @@ private[spark] class AkkaRpcEnv private[akka] ( _sender ! 
AkkaMessage(response, false) } - // Use "lazy" because most of RpcEndpoints don't need "senderAddress" - override lazy val senderAddress: RpcAddress = - new AkkaRpcEndpointRef(defaultAddress, _sender, conf).address + // Some RpcEndpoints need to know the sender's address + override val sender: RpcEndpointRef = + new AkkaRpcEndpointRef(defaultAddress, _sender, conf) }) } else { endpoint.receive diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala deleted file mode 100644 index eb25d6c7b721b..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import java.util.concurrent.{ThreadPoolExecutor, ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} -import javax.annotation.concurrent.GuardedBy - -import scala.collection.JavaConverters._ -import scala.concurrent.Promise -import scala.util.control.NonFatal - -import org.apache.spark.{SparkException, Logging} -import org.apache.spark.network.client.RpcResponseCallback -import org.apache.spark.rpc._ -import org.apache.spark.util.ThreadUtils - -/** - * A message dispatcher, responsible for routing RPC messages to the appropriate endpoint(s). - */ -private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { - - private class EndpointData( - val name: String, - val endpoint: RpcEndpoint, - val ref: NettyRpcEndpointRef) { - val inbox = new Inbox(ref, endpoint) - } - - private val endpoints = new ConcurrentHashMap[String, EndpointData] - private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef] - - // Track the receivers whose inboxes may contain messages. - private val receivers = new LinkedBlockingQueue[EndpointData] - - /** - * True if the dispatcher has been stopped. Once stopped, all messages posted will be bounced - * immediately. 
- */ - @GuardedBy("this") - private var stopped = false - - def registerRpcEndpoint(name: String, endpoint: RpcEndpoint): NettyRpcEndpointRef = { - val addr = RpcEndpointAddress(nettyEnv.address, name) - val endpointRef = new NettyRpcEndpointRef(nettyEnv.conf, addr, nettyEnv) - synchronized { - if (stopped) { - throw new IllegalStateException("RpcEnv has been stopped") - } - if (endpoints.putIfAbsent(name, new EndpointData(name, endpoint, endpointRef)) != null) { - throw new IllegalArgumentException(s"There is already an RpcEndpoint called $name") - } - val data = endpoints.get(name) - endpointRefs.put(data.endpoint, data.ref) - receivers.offer(data) // for the OnStart message - } - endpointRef - } - - def getRpcEndpointRef(endpoint: RpcEndpoint): RpcEndpointRef = endpointRefs.get(endpoint) - - def removeRpcEndpointRef(endpoint: RpcEndpoint): Unit = endpointRefs.remove(endpoint) - - // Should be idempotent - private def unregisterRpcEndpoint(name: String): Unit = { - val data = endpoints.remove(name) - if (data != null) { - data.inbox.stop() - receivers.offer(data) // for the OnStop message - } - // Don't clean `endpointRefs` here because it's possible that some messages are being processed - // now and they can use `getRpcEndpointRef`. So `endpointRefs` will be cleaned in Inbox via - // `removeRpcEndpointRef`. - } - - def stop(rpcEndpointRef: RpcEndpointRef): Unit = { - synchronized { - if (stopped) { - // This endpoint will be stopped by Dispatcher.stop() method. - return - } - unregisterRpcEndpoint(rpcEndpointRef.name) - } - } - - /** - * Send a message to all registered [[RpcEndpoint]]s in this process. - * - * This can be used to make network events known to all end points (e.g. "a new node connected"). - */ - def postToAll(message: InboxMessage): Unit = { - val iter = endpoints.keySet().iterator() - while (iter.hasNext) { - val name = iter.next - postMessage( - name, - _ => message, - () => { logWarning(s"Drop $message because $name has been stopped") }) - } - } - - /** Posts a message sent by a remote endpoint. */ - def postRemoteMessage(message: RequestMessage, callback: RpcResponseCallback): Unit = { - def createMessage(sender: NettyRpcEndpointRef): InboxMessage = { - val rpcCallContext = - new RemoteNettyRpcCallContext( - nettyEnv, sender, callback, message.senderAddress, message.needReply) - ContentMessage(message.senderAddress, message.content, message.needReply, rpcCallContext) - } - - def onEndpointStopped(): Unit = { - callback.onFailure( - new SparkException(s"Could not find ${message.receiver.name} or it has been stopped")) - } - - postMessage(message.receiver.name, createMessage, onEndpointStopped) - } - - /** Posts a message sent by a local endpoint. */ - def postLocalMessage(message: RequestMessage, p: Promise[Any]): Unit = { - def createMessage(sender: NettyRpcEndpointRef): InboxMessage = { - val rpcCallContext = - new LocalNettyRpcCallContext(sender, message.senderAddress, message.needReply, p) - ContentMessage(message.senderAddress, message.content, message.needReply, rpcCallContext) - } - - def onEndpointStopped(): Unit = { - p.tryFailure( - new SparkException(s"Could not find ${message.receiver.name} or it has been stopped")) - } - - postMessage(message.receiver.name, createMessage, onEndpointStopped) - } - - /** - * Posts a message to a specific endpoint. - * - * @param endpointName name of the endpoint. - * @param createMessageFn function to create the message. - * @param callbackIfStopped callback function if the endpoint is stopped. 
- */ - private def postMessage( - endpointName: String, - createMessageFn: NettyRpcEndpointRef => InboxMessage, - callbackIfStopped: () => Unit): Unit = { - val shouldCallOnStop = synchronized { - val data = endpoints.get(endpointName) - if (stopped || data == null) { - true - } else { - data.inbox.post(createMessageFn(data.ref)) - receivers.offer(data) - false - } - } - if (shouldCallOnStop) { - // We don't need to call `onStop` in the `synchronized` block - callbackIfStopped() - } - } - - def stop(): Unit = { - synchronized { - if (stopped) { - return - } - stopped = true - } - // Stop all endpoints. This will queue all endpoints for processing by the message loops. - endpoints.keySet().asScala.foreach(unregisterRpcEndpoint) - // Enqueue a message that tells the message loops to stop. - receivers.offer(PoisonPill) - threadpool.shutdown() - } - - def awaitTermination(): Unit = { - threadpool.awaitTermination(Long.MaxValue, TimeUnit.MILLISECONDS) - } - - /** - * Return if the endpoint exists - */ - def verify(name: String): Boolean = { - endpoints.containsKey(name) - } - - /** Thread pool used for dispatching messages. */ - private val threadpool: ThreadPoolExecutor = { - val numThreads = nettyEnv.conf.getInt("spark.rpc.netty.dispatcher.numThreads", - Runtime.getRuntime.availableProcessors()) - val pool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "dispatcher-event-loop") - for (i <- 0 until numThreads) { - pool.execute(new MessageLoop) - } - pool - } - - /** Message loop used for dispatching messages. */ - private class MessageLoop extends Runnable { - override def run(): Unit = { - try { - while (true) { - try { - val data = receivers.take() - if (data == PoisonPill) { - // Put PoisonPill back so that other MessageLoops can see it. - receivers.offer(PoisonPill) - return - } - data.inbox.process(Dispatcher.this) - } catch { - case NonFatal(e) => logError(e.getMessage, e) - } - } - } catch { - case ie: InterruptedException => // exit - } - } - } - - /** A poison endpoint that indicates MessageLoop should exit its message loop. */ - private val PoisonPill = new EndpointData(null, null, null) -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala deleted file mode 100644 index c72b588db57fe..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.rpc.netty - -import javax.annotation.concurrent.GuardedBy - -import scala.util.control.NonFatal - -import com.google.common.annotations.VisibleForTesting - -import org.apache.spark.{Logging, SparkException} -import org.apache.spark.rpc.{RpcAddress, RpcEndpoint, ThreadSafeRpcEndpoint} - - -private[netty] sealed trait InboxMessage - -private[netty] case class ContentMessage( - senderAddress: RpcAddress, - content: Any, - needReply: Boolean, - context: NettyRpcCallContext) extends InboxMessage - -private[netty] case object OnStart extends InboxMessage - -private[netty] case object OnStop extends InboxMessage - -/** A message to tell all endpoints that a remote process has connected. */ -private[netty] case class RemoteProcessConnected(remoteAddress: RpcAddress) extends InboxMessage - -/** A message to tell all endpoints that a remote process has disconnected. */ -private[netty] case class RemoteProcessDisconnected(remoteAddress: RpcAddress) extends InboxMessage - -/** A message to tell all endpoints that a network error has happened. */ -private[netty] case class RemoteProcessConnectionError(cause: Throwable, remoteAddress: RpcAddress) - extends InboxMessage - -/** - * A inbox that stores messages for an [[RpcEndpoint]] and posts messages to it thread-safely. - */ -private[netty] class Inbox( - val endpointRef: NettyRpcEndpointRef, - val endpoint: RpcEndpoint) - extends Logging { - - inbox => // Give this an alias so we can use it more clearly in closures. - - @GuardedBy("this") - protected val messages = new java.util.LinkedList[InboxMessage]() - - /** True if the inbox (and its associated endpoint) is stopped. */ - @GuardedBy("this") - private var stopped = false - - /** Allow multiple threads to process messages at the same time. */ - @GuardedBy("this") - private var enableConcurrent = false - - /** The number of threads processing messages for this inbox. */ - @GuardedBy("this") - private var numActiveThreads = 0 - - // OnStart should be the first message to process - inbox.synchronized { - messages.add(OnStart) - } - - /** - * Process stored messages. - */ - def process(dispatcher: Dispatcher): Unit = { - var message: InboxMessage = null - inbox.synchronized { - if (!enableConcurrent && numActiveThreads != 0) { - return - } - message = messages.poll() - if (message != null) { - numActiveThreads += 1 - } else { - return - } - } - while (true) { - safelyCall(endpoint) { - message match { - case ContentMessage(_sender, content, needReply, context) => - // The partial function to call - val pf = if (needReply) endpoint.receiveAndReply(context) else endpoint.receive - try { - pf.applyOrElse[Any, Unit](content, { msg => - throw new SparkException(s"Unsupported message $message from ${_sender}") - }) - if (!needReply) { - context.finish() - } - } catch { - case NonFatal(e) => - if (needReply) { - // If the sender asks a reply, we should send the error back to the sender - context.sendFailure(e) - } else { - context.finish() - } - // Throw the exception -- this exception will be caught by the safelyCall function. - // The endpoint's onError function will be called. 
- throw e - } - - case OnStart => - endpoint.onStart() - if (!endpoint.isInstanceOf[ThreadSafeRpcEndpoint]) { - inbox.synchronized { - if (!stopped) { - enableConcurrent = true - } - } - } - - case OnStop => - val activeThreads = inbox.synchronized { inbox.numActiveThreads } - assert(activeThreads == 1, - s"There should be only a single active thread but found $activeThreads threads.") - dispatcher.removeRpcEndpointRef(endpoint) - endpoint.onStop() - assert(isEmpty, "OnStop should be the last message") - - case RemoteProcessConnected(remoteAddress) => - endpoint.onConnected(remoteAddress) - - case RemoteProcessDisconnected(remoteAddress) => - endpoint.onDisconnected(remoteAddress) - - case RemoteProcessConnectionError(cause, remoteAddress) => - endpoint.onNetworkError(cause, remoteAddress) - } - } - - inbox.synchronized { - // "enableConcurrent" will be set to false after `onStop` is called, so we should check it - // every time. - if (!enableConcurrent && numActiveThreads != 1) { - // If we are not the only one worker, exit - numActiveThreads -= 1 - return - } - message = messages.poll() - if (message == null) { - numActiveThreads -= 1 - return - } - } - } - } - - def post(message: InboxMessage): Unit = inbox.synchronized { - if (stopped) { - // We already put "OnStop" into "messages", so we should drop further messages - onDrop(message) - } else { - messages.add(message) - false - } - } - - def stop(): Unit = inbox.synchronized { - // The following codes should be in `synchronized` so that we can make sure "OnStop" is the last - // message - if (!stopped) { - // We should disable concurrent here. Then when RpcEndpoint.onStop is called, it's the only - // thread that is processing messages. So `RpcEndpoint.onStop` can release its resources - // safely. - enableConcurrent = false - stopped = true - messages.add(OnStop) - // Note: The concurrent events in messages will be processed one by one. - } - } - - def isEmpty: Boolean = inbox.synchronized { messages.isEmpty } - - /** Called when we are dropping a message. Test cases override this to test message dropping. */ - @VisibleForTesting - protected def onDrop(message: InboxMessage): Unit = { - logWarning(s"Drop $message because $endpointRef is stopped") - } - - /** - * Calls action closure, and calls the endpoint's onError function in the case of exceptions. - */ - private def safelyCall(endpoint: RpcEndpoint)(action: => Unit): Unit = { - try action catch { - case NonFatal(e) => - try endpoint.onError(e) catch { - case NonFatal(ee) => logError(s"Ignoring error", ee) - } - } - } - -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala deleted file mode 100644 index 21d5bb4923d1b..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import scala.concurrent.Promise - -import org.apache.spark.Logging -import org.apache.spark.network.client.RpcResponseCallback -import org.apache.spark.rpc.{RpcAddress, RpcCallContext} - -private[netty] abstract class NettyRpcCallContext( - endpointRef: NettyRpcEndpointRef, - override val senderAddress: RpcAddress, - needReply: Boolean) - extends RpcCallContext with Logging { - - protected def send(message: Any): Unit - - override def reply(response: Any): Unit = { - if (needReply) { - send(AskResponse(endpointRef, response)) - } else { - throw new IllegalStateException( - s"Cannot send $response to the sender because the sender does not expect a reply") - } - } - - override def sendFailure(e: Throwable): Unit = { - if (needReply) { - send(AskResponse(endpointRef, RpcFailure(e))) - } else { - logError(e.getMessage, e) - throw new IllegalStateException( - "Cannot send reply to the sender because the sender won't handle it") - } - } - - def finish(): Unit = { - if (!needReply) { - send(Ack(endpointRef)) - } - } -} - -/** - * If the sender and the receiver are in the same process, the reply can be sent back via `Promise`. - */ -private[netty] class LocalNettyRpcCallContext( - endpointRef: NettyRpcEndpointRef, - senderAddress: RpcAddress, - needReply: Boolean, - p: Promise[Any]) - extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { - - override protected def send(message: Any): Unit = { - p.success(message) - } -} - -/** - * A [[RpcCallContext]] that will call [[RpcResponseCallback]] to send the reply back. - */ -private[netty] class RemoteNettyRpcCallContext( - nettyEnv: NettyRpcEnv, - endpointRef: NettyRpcEndpointRef, - callback: RpcResponseCallback, - senderAddress: RpcAddress, - needReply: Boolean) - extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { - - override protected def send(message: Any): Unit = { - val reply = nettyEnv.serialize(message) - callback.onSuccess(reply) - } -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala deleted file mode 100644 index 09093819bb22c..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ /dev/null @@ -1,532 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.rpc.netty - -import java.io._ -import java.lang.{Boolean => JBoolean} -import java.net.{InetSocketAddress, URI} -import java.nio.ByteBuffer -import java.util.concurrent._ -import java.util.concurrent.atomic.AtomicBoolean -import javax.annotation.Nullable; -import javax.annotation.concurrent.GuardedBy - -import scala.collection.mutable -import scala.concurrent.{Future, Promise} -import scala.reflect.ClassTag -import scala.util.{DynamicVariable, Failure, Success} -import scala.util.control.NonFatal - -import com.google.common.base.Preconditions -import org.apache.spark.{Logging, SecurityManager, SparkConf} -import org.apache.spark.network.TransportContext -import org.apache.spark.network.client._ -import org.apache.spark.network.netty.SparkTransportConf -import org.apache.spark.network.sasl.{SaslClientBootstrap, SaslServerBootstrap} -import org.apache.spark.network.server._ -import org.apache.spark.rpc._ -import org.apache.spark.serializer.{JavaSerializer, JavaSerializerInstance} -import org.apache.spark.util.{ThreadUtils, Utils} - -private[netty] class NettyRpcEnv( - val conf: SparkConf, - javaSerializerInstance: JavaSerializerInstance, - host: String, - securityManager: SecurityManager) extends RpcEnv(conf) with Logging { - - private val transportConf = SparkTransportConf.fromSparkConf( - conf.clone.set("spark.shuffle.io.numConnectionsPerPeer", "1"), - conf.getInt("spark.rpc.io.threads", 0)) - - private val dispatcher: Dispatcher = new Dispatcher(this) - - private val transportContext = new TransportContext(transportConf, - new NettyRpcHandler(dispatcher, this)) - - private val clientFactory = { - val bootstraps: java.util.List[TransportClientBootstrap] = - if (securityManager.isAuthenticationEnabled()) { - java.util.Arrays.asList(new SaslClientBootstrap(transportConf, "", securityManager, - securityManager.isSaslEncryptionEnabled())) - } else { - java.util.Collections.emptyList[TransportClientBootstrap] - } - transportContext.createClientFactory(bootstraps) - } - - val timeoutScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("netty-rpc-env-timeout") - - // Because TransportClientFactory.createClient is blocking, we need to run it in this thread pool - // to implement non-blocking send/ask. - // TODO: a non-blocking TransportClientFactory.createClient in future - private[netty] val clientConnectionExecutor = ThreadUtils.newDaemonCachedThreadPool( - "netty-rpc-connection", - conf.getInt("spark.rpc.connect.threads", 64)) - - @volatile private var server: TransportServer = _ - - private val stopped = new AtomicBoolean(false) - - /** - * A map for [[RpcAddress]] and [[Outbox]]. When we are connecting to a remote [[RpcAddress]], - * we just put messages to its [[Outbox]] to implement a non-blocking `send` method. - */ - private val outboxes = new ConcurrentHashMap[RpcAddress, Outbox]() - - /** - * Remove the address's Outbox and stop it. 
- */ - private[netty] def removeOutbox(address: RpcAddress): Unit = { - val outbox = outboxes.remove(address) - if (outbox != null) { - outbox.stop() - } - } - - def startServer(port: Int): Unit = { - val bootstraps: java.util.List[TransportServerBootstrap] = - if (securityManager.isAuthenticationEnabled()) { - java.util.Arrays.asList(new SaslServerBootstrap(transportConf, securityManager)) - } else { - java.util.Collections.emptyList() - } - server = transportContext.createServer(port, bootstraps) - dispatcher.registerRpcEndpoint( - RpcEndpointVerifier.NAME, new RpcEndpointVerifier(this, dispatcher)) - } - - @Nullable - override lazy val address: RpcAddress = { - if (server != null) RpcAddress(host, server.getPort()) else null - } - - override def setupEndpoint(name: String, endpoint: RpcEndpoint): RpcEndpointRef = { - dispatcher.registerRpcEndpoint(name, endpoint) - } - - def asyncSetupEndpointRefByURI(uri: String): Future[RpcEndpointRef] = { - val addr = RpcEndpointAddress(uri) - val endpointRef = new NettyRpcEndpointRef(conf, addr, this) - val verifier = new NettyRpcEndpointRef( - conf, RpcEndpointAddress(addr.rpcAddress, RpcEndpointVerifier.NAME), this) - verifier.ask[Boolean](RpcEndpointVerifier.CheckExistence(endpointRef.name)).flatMap { find => - if (find) { - Future.successful(endpointRef) - } else { - Future.failed(new RpcEndpointNotFoundException(uri)) - } - }(ThreadUtils.sameThread) - } - - override def stop(endpointRef: RpcEndpointRef): Unit = { - require(endpointRef.isInstanceOf[NettyRpcEndpointRef]) - dispatcher.stop(endpointRef) - } - - private def postToOutbox(receiver: NettyRpcEndpointRef, message: OutboxMessage): Unit = { - if (receiver.client != null) { - receiver.client.sendRpc(message.content, message.createCallback(receiver.client)); - } else { - require(receiver.address != null, - "Cannot send message to client endpoint with no listen address.") - val targetOutbox = { - val outbox = outboxes.get(receiver.address) - if (outbox == null) { - val newOutbox = new Outbox(this, receiver.address) - val oldOutbox = outboxes.putIfAbsent(receiver.address, newOutbox) - if (oldOutbox == null) { - newOutbox - } else { - oldOutbox - } - } else { - outbox - } - } - if (stopped.get) { - // It's possible that we put `targetOutbox` after stopping. So we need to clean it. - outboxes.remove(receiver.address) - targetOutbox.stop() - } else { - targetOutbox.send(message) - } - } - } - - private[netty] def send(message: RequestMessage): Unit = { - val remoteAddr = message.receiver.address - if (remoteAddr == address) { - // Message to a local RPC endpoint. - val promise = Promise[Any]() - dispatcher.postLocalMessage(message, promise) - promise.future.onComplete { - case Success(response) => - val ack = response.asInstanceOf[Ack] - logTrace(s"Received ack from ${ack.sender}") - case Failure(e) => - logWarning(s"Exception when sending $message", e) - }(ThreadUtils.sameThread) - } else { - // Message to a remote RPC endpoint. 
- postToOutbox(message.receiver, OutboxMessage(serialize(message), - (e) => { - logWarning(s"Exception when sending $message", e) - }, - (client, response) => { - val ack = deserialize[Ack](client, response) - logDebug(s"Receive ack from ${ack.sender}") - })) - } - } - - private[netty] def createClient(address: RpcAddress): TransportClient = { - clientFactory.createClient(address.host, address.port) - } - - private[netty] def ask(message: RequestMessage): Future[Any] = { - val promise = Promise[Any]() - val remoteAddr = message.receiver.address - if (remoteAddr == address) { - val p = Promise[Any]() - dispatcher.postLocalMessage(message, p) - p.future.onComplete { - case Success(response) => - val reply = response.asInstanceOf[AskResponse] - if (reply.reply.isInstanceOf[RpcFailure]) { - if (!promise.tryFailure(reply.reply.asInstanceOf[RpcFailure].e)) { - logWarning(s"Ignore failure: ${reply.reply}") - } - } else if (!promise.trySuccess(reply.reply)) { - logWarning(s"Ignore message: ${reply}") - } - case Failure(e) => - if (!promise.tryFailure(e)) { - logWarning("Ignore Exception", e) - } - }(ThreadUtils.sameThread) - } else { - postToOutbox(message.receiver, OutboxMessage(serialize(message), - (e) => { - if (!promise.tryFailure(e)) { - logWarning("Ignore Exception", e) - } - }, - (client, response) => { - val reply = deserialize[AskResponse](client, response) - if (reply.reply.isInstanceOf[RpcFailure]) { - if (!promise.tryFailure(reply.reply.asInstanceOf[RpcFailure].e)) { - logWarning(s"Ignore failure: ${reply.reply}") - } - } else if (!promise.trySuccess(reply.reply)) { - logWarning(s"Ignore message: ${reply}") - } - })) - } - promise.future - } - - private[netty] def serialize(content: Any): Array[Byte] = { - val buffer = javaSerializerInstance.serialize(content) - java.util.Arrays.copyOfRange( - buffer.array(), buffer.arrayOffset + buffer.position, buffer.arrayOffset + buffer.limit) - } - - private[netty] def deserialize[T: ClassTag](client: TransportClient, bytes: Array[Byte]): T = { - NettyRpcEnv.currentClient.withValue(client) { - deserialize { () => - javaSerializerInstance.deserialize[T](ByteBuffer.wrap(bytes)) - } - } - } - - override def endpointRef(endpoint: RpcEndpoint): RpcEndpointRef = { - dispatcher.getRpcEndpointRef(endpoint) - } - - override def uriOf(systemName: String, address: RpcAddress, endpointName: String): String = - new RpcEndpointAddress(address, endpointName).toString - - override def shutdown(): Unit = { - cleanup() - } - - override def awaitTermination(): Unit = { - dispatcher.awaitTermination() - } - - private def cleanup(): Unit = { - if (!stopped.compareAndSet(false, true)) { - return - } - - val iter = outboxes.values().iterator() - while (iter.hasNext()) { - val outbox = iter.next() - outboxes.remove(outbox.address) - outbox.stop() - } - if (timeoutScheduler != null) { - timeoutScheduler.shutdownNow() - } - if (server != null) { - server.close() - } - if (clientFactory != null) { - clientFactory.close() - } - if (dispatcher != null) { - dispatcher.stop() - } - if (clientConnectionExecutor != null) { - clientConnectionExecutor.shutdownNow() - } - } - - override def deserialize[T](deserializationAction: () => T): T = { - NettyRpcEnv.currentEnv.withValue(this) { - deserializationAction() - } - } - -} - -private[netty] object NettyRpcEnv extends Logging { - - /** - * When deserializing the [[NettyRpcEndpointRef]], it needs a reference to [[NettyRpcEnv]]. - * Use `currentEnv` to wrap the deserialization codes. 
E.g., - * - * {{{ - * NettyRpcEnv.currentEnv.withValue(this) { - * your deserialization codes - * } - * }}} - */ - private[netty] val currentEnv = new DynamicVariable[NettyRpcEnv](null) - - /** - * Similar to `currentEnv`, this variable references the client instance associated with an - * RPC, in case it's needed to find out the remote address during deserialization. - */ - private[netty] val currentClient = new DynamicVariable[TransportClient](null) - -} - -private[netty] class NettyRpcEnvFactory extends RpcEnvFactory with Logging { - - def create(config: RpcEnvConfig): RpcEnv = { - val sparkConf = config.conf - // Use JavaSerializerInstance in multiple threads is safe. However, if we plan to support - // KryoSerializer in future, we have to use ThreadLocal to store SerializerInstance - val javaSerializerInstance = - new JavaSerializer(sparkConf).newInstance().asInstanceOf[JavaSerializerInstance] - val nettyEnv = - new NettyRpcEnv(sparkConf, javaSerializerInstance, config.host, config.securityManager) - if (!config.clientMode) { - val startNettyRpcEnv: Int => (NettyRpcEnv, Int) = { actualPort => - nettyEnv.startServer(actualPort) - (nettyEnv, actualPort) - } - try { - Utils.startServiceOnPort(config.port, startNettyRpcEnv, sparkConf, "NettyRpcEnv")._1 - } catch { - case NonFatal(e) => - nettyEnv.shutdown() - throw e - } - } - nettyEnv - } -} - -/** - * The NettyRpcEnv version of RpcEndpointRef. - * - * This class behaves differently depending on where it's created. On the node that "owns" the - * RpcEndpoint, it's a simple wrapper around the RpcEndpointAddress instance. - * - * On other machines that receive a serialized version of the reference, the behavior changes. The - * instance will keep track of the TransportClient that sent the reference, so that messages - * to the endpoint are sent over the client connection, instead of needing a new connection to - * be opened. - * - * The RpcAddress of this ref can be null; what that means is that the ref can only be used through - * a client connection, since the process hosting the endpoint is not listening for incoming - * connections. These refs should not be shared with 3rd parties, since they will not be able to - * send messages to the endpoint. - * - * @param conf Spark configuration. - * @param endpointAddress The address where the endpoint is listening. - * @param nettyEnv The RpcEnv associated with this ref. - * @param local Whether the referenced endpoint lives in the same process. 
- */ -private[netty] class NettyRpcEndpointRef( - @transient private val conf: SparkConf, - endpointAddress: RpcEndpointAddress, - @transient @volatile private var nettyEnv: NettyRpcEnv) - extends RpcEndpointRef(conf) with Serializable with Logging { - - @transient @volatile var client: TransportClient = _ - - private val _address = if (endpointAddress.rpcAddress != null) endpointAddress else null - private val _name = endpointAddress.name - - override def address: RpcAddress = if (_address != null) _address.rpcAddress else null - - private def readObject(in: ObjectInputStream): Unit = { - in.defaultReadObject() - nettyEnv = NettyRpcEnv.currentEnv.value - client = NettyRpcEnv.currentClient.value - } - - private def writeObject(out: ObjectOutputStream): Unit = { - out.defaultWriteObject() - } - - override def name: String = _name - - override def ask[T: ClassTag](message: Any, timeout: RpcTimeout): Future[T] = { - val promise = Promise[Any]() - val timeoutCancelable = nettyEnv.timeoutScheduler.schedule(new Runnable { - override def run(): Unit = { - promise.tryFailure(new TimeoutException("Cannot receive any reply in " + timeout.duration)) - } - }, timeout.duration.toNanos, TimeUnit.NANOSECONDS) - val f = nettyEnv.ask(RequestMessage(nettyEnv.address, this, message, true)) - f.onComplete { v => - timeoutCancelable.cancel(true) - if (!promise.tryComplete(v)) { - logWarning(s"Ignore message $v") - } - }(ThreadUtils.sameThread) - promise.future.mapTo[T].recover(timeout.addMessageIfTimeout)(ThreadUtils.sameThread) - } - - override def send(message: Any): Unit = { - require(message != null, "Message is null") - nettyEnv.send(RequestMessage(nettyEnv.address, this, message, false)) - } - - override def toString: String = s"NettyRpcEndpointRef(${_address})" - - def toURI: URI = new URI(s"spark://${_address}") - - final override def equals(that: Any): Boolean = that match { - case other: NettyRpcEndpointRef => _address == other._address - case _ => false - } - - final override def hashCode(): Int = if (_address == null) 0 else _address.hashCode() -} - -/** - * The message that is sent from the sender to the receiver. - */ -private[netty] case class RequestMessage( - senderAddress: RpcAddress, receiver: NettyRpcEndpointRef, content: Any, needReply: Boolean) - -/** - * The base trait for all messages that are sent back from the receiver to the sender. - */ -private[netty] trait ResponseMessage - -/** - * The reply for `ask` from the receiver side. - */ -private[netty] case class AskResponse(sender: NettyRpcEndpointRef, reply: Any) - extends ResponseMessage - -/** - * A message to send back to the receiver side. It's necessary because [[TransportClient]] only - * clean the resources when it receives a reply. - */ -private[netty] case class Ack(sender: NettyRpcEndpointRef) extends ResponseMessage - -/** - * A response that indicates some failure happens in the receiver side. - */ -private[netty] case class RpcFailure(e: Throwable) - -/** - * Dispatches incoming RPCs to registered endpoints. - * - * The handler keeps track of all client instances that communicate with it, so that the RpcEnv - * knows which `TransportClient` instance to use when sending RPCs to a client endpoint (i.e., - * one that is not listening for incoming connections, but rather needs to be contacted via the - * client socket). 
- * - * Events are sent on a per-connection basis, so if a client opens multiple connections to the - * RpcEnv, multiple connection / disconnection events will be created for that client (albeit - * with different `RpcAddress` information). - */ -private[netty] class NettyRpcHandler( - dispatcher: Dispatcher, nettyEnv: NettyRpcEnv) extends RpcHandler with Logging { - - // TODO: Can we add connection callback (channel registered) to the underlying framework? - // A variable to track whether we should dispatch the RemoteProcessConnected message. - private val clients = new ConcurrentHashMap[TransportClient, JBoolean]() - - override def receive( - client: TransportClient, - message: Array[Byte], - callback: RpcResponseCallback): Unit = { - val addr = client.getChannel().remoteAddress().asInstanceOf[InetSocketAddress] - assert(addr != null) - val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - if (clients.putIfAbsent(client, JBoolean.TRUE) == null) { - dispatcher.postToAll(RemoteProcessConnected(clientAddr)) - } - val requestMessage = nettyEnv.deserialize[RequestMessage](client, message) - val messageToDispatch = if (requestMessage.senderAddress == null) { - // Create a new message with the socket address of the client as the sender. - RequestMessage(clientAddr, requestMessage.receiver, requestMessage.content, - requestMessage.needReply) - } else { - requestMessage - } - dispatcher.postRemoteMessage(messageToDispatch, callback) - } - - override def getStreamManager: StreamManager = new OneForOneStreamManager - - override def exceptionCaught(cause: Throwable, client: TransportClient): Unit = { - val addr = client.getChannel.remoteAddress().asInstanceOf[InetSocketAddress] - if (addr != null) { - val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - dispatcher.postToAll(RemoteProcessConnectionError(cause, clientAddr)) - } else { - // If the channel is closed before connecting, its remoteAddress will be null. - // See java.net.Socket.getRemoteSocketAddress - // Because we cannot get a RpcAddress, just log it - logError("Exception before connecting to the client", cause) - } - } - - override def connectionTerminated(client: TransportClient): Unit = { - val addr = client.getChannel.remoteAddress().asInstanceOf[InetSocketAddress] - if (addr != null) { - val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - clients.remove(client) - nettyEnv.removeOutbox(clientAddr) - dispatcher.postToAll(RemoteProcessDisconnected(clientAddr)) - } else { - // If the channel is closed before connecting, its remoteAddress will be null. In this case, - // we can ignore it since we don't fire "Associated". - // See java.net.Socket.getRemoteSocketAddress - } - } -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala deleted file mode 100644 index 2f6817f2eb935..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import java.util.concurrent.Callable -import javax.annotation.concurrent.GuardedBy - -import scala.util.control.NonFatal - -import org.apache.spark.SparkException -import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} -import org.apache.spark.rpc.RpcAddress - -private[netty] case class OutboxMessage(content: Array[Byte], - _onFailure: (Throwable) => Unit, - _onSuccess: (TransportClient, Array[Byte]) => Unit) { - - def createCallback(client: TransportClient): RpcResponseCallback = new RpcResponseCallback() { - override def onFailure(e: Throwable): Unit = { - _onFailure(e) - } - - override def onSuccess(response: Array[Byte]): Unit = { - _onSuccess(client, response) - } - } - -} - -private[netty] class Outbox(nettyEnv: NettyRpcEnv, val address: RpcAddress) { - - outbox => // Give this an alias so we can use it more clearly in closures. - - @GuardedBy("this") - private val messages = new java.util.LinkedList[OutboxMessage] - - @GuardedBy("this") - private var client: TransportClient = null - - /** - * connectFuture points to the connect task. If there is no connect task, connectFuture will be - * null. - */ - @GuardedBy("this") - private var connectFuture: java.util.concurrent.Future[Unit] = null - - @GuardedBy("this") - private var stopped = false - - /** - * If there is any thread draining the message queue - */ - @GuardedBy("this") - private var draining = false - - /** - * Send a message. If there is no active connection, cache it and launch a new connection. If - * [[Outbox]] is stopped, the sender will be notified with a [[SparkException]]. - */ - def send(message: OutboxMessage): Unit = { - val dropped = synchronized { - if (stopped) { - true - } else { - messages.add(message) - false - } - } - if (dropped) { - message._onFailure(new SparkException("Message is dropped because Outbox is stopped")) - } else { - drainOutbox() - } - } - - /** - * Drain the message queue. If there is other draining thread, just exit. If the connection has - * not been established, launch a task in the `nettyEnv.clientConnectionExecutor` to setup the - * connection. - */ - private def drainOutbox(): Unit = { - var message: OutboxMessage = null - synchronized { - if (stopped) { - return - } - if (connectFuture != null) { - // We are connecting to the remote address, so just exit - return - } - if (client == null) { - // There is no connect task but client is null, so we need to launch the connect task. 
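Illustrative aside, not part of Outbox.scala: drainOutbox above relies on a single-drainer discipline in which any thread may enqueue, but only one thread drains at a time and keeps going until it observes an empty queue while holding the lock. A minimal standalone sketch of that discipline follows (the class name SingleDrainerQueue is hypothetical; connection setup and failure handling are omitted).

import java.util.{ArrayDeque => JArrayDeque}

class SingleDrainerQueue[A](deliver: A => Unit) {
  private val messages = new JArrayDeque[A]()
  private var draining = false

  def send(message: A): Unit = {
    synchronized { messages.add(message) }
    drain()
  }

  private def drain(): Unit = {
    var next: A = synchronized {
      if (draining || messages.isEmpty) return   // another thread is draining, or nothing to do
      draining = true
      messages.poll()
    }
    while (true) {
      deliver(next)                              // deliver outside the lock, like client.sendRpc above
      synchronized {
        if (messages.isEmpty) { draining = false; return }
        next = messages.poll()
      }
    }
  }
}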
- launchConnectTask() - return - } - if (draining) { - // There is some thread draining, so just exit - return - } - message = messages.poll() - if (message == null) { - return - } - draining = true - } - while (true) { - try { - val _client = synchronized { client } - if (_client != null) { - _client.sendRpc(message.content, message.createCallback(_client)) - } else { - assert(stopped == true) - } - } catch { - case NonFatal(e) => - handleNetworkFailure(e) - return - } - synchronized { - if (stopped) { - return - } - message = messages.poll() - if (message == null) { - draining = false - return - } - } - } - } - - private def launchConnectTask(): Unit = { - connectFuture = nettyEnv.clientConnectionExecutor.submit(new Callable[Unit] { - - override def call(): Unit = { - try { - val _client = nettyEnv.createClient(address) - outbox.synchronized { - client = _client - if (stopped) { - closeClient() - } - } - } catch { - case ie: InterruptedException => - // exit - return - case NonFatal(e) => - outbox.synchronized { connectFuture = null } - handleNetworkFailure(e) - return - } - outbox.synchronized { connectFuture = null } - // It's possible that no thread is draining now. If we don't drain here, we cannot send the - // messages until the next message arrives. - drainOutbox() - } - }) - } - - /** - * Stop [[Inbox]] and notify the waiting messages with the cause. - */ - private def handleNetworkFailure(e: Throwable): Unit = { - synchronized { - assert(connectFuture == null) - if (stopped) { - return - } - stopped = true - closeClient() - } - // Remove this Outbox from nettyEnv so that the further messages will create a new Outbox along - // with a new connection - nettyEnv.removeOutbox(address) - - // Notify the connection failure for the remaining messages - // - // We always check `stopped` before updating messages, so here we can make sure no thread will - // update messages and it's safe to just drain the queue. - var message = messages.poll() - while (message != null) { - message._onFailure(e) - message = messages.poll() - } - assert(messages.isEmpty) - } - - private def closeClient(): Unit = synchronized { - // Not sure if `client.close` is idempotent. Just for safety. - if (client != null) { - client.close() - } - client = null - } - - /** - * Stop [[Outbox]]. The remaining messages in the [[Outbox]] will be notified with a - * [[SparkException]]. - */ - def stop(): Unit = { - synchronized { - if (stopped) { - return - } - stopped = true - if (connectFuture != null) { - connectFuture.cancel(true) - } - closeClient() - } - - // We always check `stopped` before updating messages, so here we can make sure no thread will - // update messages and it's safe to just drain the queue. - var message = messages.poll() - while (message != null) { - message._onFailure(new SparkException("Message is dropped because Outbox is stopped")) - message = messages.poll() - } - } -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala deleted file mode 100644 index d2e94f943aba5..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import org.apache.spark.SparkException -import org.apache.spark.rpc.RpcAddress - -/** - * An address identifier for an RPC endpoint. - * - * The `rpcAddress` may be null, in which case the endpoint is registered via a client-only - * connection and can only be reached via the client that sent the endpoint reference. - * - * @param rpcAddress The socket address of the endpint. - * @param name Name of the endpoint. - */ -private[netty] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { - - require(name != null, "RpcEndpoint name must be provided.") - - def this(host: String, port: Int, name: String) = { - this(RpcAddress(host, port), name) - } - - override val toString = if (rpcAddress != null) { - s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" - } else { - s"spark-client://$name" - } -} - -private[netty] object RpcEndpointAddress { - - def apply(sparkUrl: String): RpcEndpointAddress = { - try { - val uri = new java.net.URI(sparkUrl) - val host = uri.getHost - val port = uri.getPort - val name = uri.getUserInfo - if (uri.getScheme != "spark" || - host == null || - port < 0 || - name == null || - (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null - uri.getFragment != null || - uri.getQuery != null) { - throw new SparkException("Invalid Spark URL: " + sparkUrl) - } - new RpcEndpointAddress(host, port, name) - } catch { - case e: java.net.URISyntaxException => - throw new SparkException("Invalid Spark URL: " + sparkUrl, e) - } - } -} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala deleted file mode 100644 index 99f20da2d66aa..0000000000000 --- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv} - -/** - * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an [[RpcEndpoint]] exists. 
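For reference, the URL format accepted by RpcEndpointAddress.apply above is spark://<name>@<host>:<port>, and a client-only endpoint renders as spark-client://<name>. A tiny standalone sketch of how java.net.URI decomposes such a string; the endpoint name "driver" and the host are made-up example values.

import java.net.URI

object SparkUrlDemo {
  def main(args: Array[String]): Unit = {
    val uri = new URI("spark://driver@192.168.1.10:7077")
    println(uri.getUserInfo)  // driver        -> endpoint name
    println(uri.getHost)      // 192.168.1.10  -> RPC host
    println(uri.getPort)      // 7077          -> RPC port
  }
}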
- * - * This is used when setting up a remote endpoint reference. - */ -private[netty] class RpcEndpointVerifier(override val rpcEnv: RpcEnv, dispatcher: Dispatcher) - extends RpcEndpoint { - - override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case RpcEndpointVerifier.CheckExistence(name) => context.reply(dispatcher.verify(name)) - } -} - -private[netty] object RpcEndpointVerifier { - val NAME = "endpoint-verifier" - - /** A message used to ask the remote [[RpcEndpointVerifier]] if an [[RpcEndpoint]] exists. */ - case class CheckExistence(name: String) -} diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 4a9518fff4e7b..7adba79fae2f4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -184,6 +184,22 @@ class DAGScheduler( private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) + // Flag to control if reduce tasks are assigned preferred locations + private val shuffleLocalityEnabled = + sc.getConf.getBoolean("spark.shuffle.reduceLocality.enabled", true) + // Number of map, reduce tasks above which we do not assign preferred locations + // based on map output sizes. We limit the size of jobs for which assign preferred locations + // as computing the top locations by size becomes expensive. + private[this] val SHUFFLE_PREF_MAP_THRESHOLD = 1000 + // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that + private[this] val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000 + + // Fraction of total map output that must be at a location for it to considered as a preferred + // location for a reduce task. + // Making this larger will focus on fewer locations where most data can be read locally, but + // may lead to more delay in scheduling if those locations are busy. + private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 + /** * Called by the TaskSetManager to report task's starting. */ @@ -1555,10 +1571,25 @@ class DAGScheduler( return locs } } - case _ => } + // If the RDD has shuffle dependencies and shuffle locality is enabled, pick locations that + // have at least REDUCER_PREF_LOCS_FRACTION of data as preferred locations + if (shuffleLocalityEnabled && rdd.partitions.length < SHUFFLE_PREF_REDUCE_THRESHOLD) { + rdd.dependencies.foreach { + case s: ShuffleDependency[_, _, _] => + if (s.rdd.partitions.length < SHUFFLE_PREF_MAP_THRESHOLD) { + // Get the preferred map output locations for this reducer + val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId, + partition, rdd.partitions.length, REDUCER_PREF_LOCS_FRACTION) + if (topLocsForReducer.nonEmpty) { + return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId)) + } + } + case _ => + } + } Nil } diff --git a/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala b/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala deleted file mode 100644 index b36c457d6d514..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import org.apache.spark.{ShuffleDependency, Aggregator, Partitioner} -import org.apache.spark.serializer.Serializer - -/** - * A basic ShuffleHandle implementation that just captures registerShuffle's parameters. - */ -private[spark] class BaseShuffleHandle[K, V, C]( - shuffleId: Int, - val numMaps: Int, - val dependency: ShuffleDependency[K, V, C]) - extends ShuffleHandle(shuffleId) diff --git a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala deleted file mode 100644 index b0abda4a81b8d..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import org.apache.spark._ -import org.apache.spark.serializer.Serializer -import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator} -import org.apache.spark.util.CompletionIterator -import org.apache.spark.util.collection.ExternalSorter - -/** - * Fetches and reads the partitions in range [startPartition, endPartition) from a shuffle by - * requesting them from other nodes' block stores. 
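Conceptually, the read() method below concatenates the fetched blocks into one record stream, optionally merges map-side-combined values per key, and optionally sorts by key. A deliberately simplified, in-memory sketch of that pipeline is shown here; mergeCombiners and keyOrdering stand in for dep.aggregator and dep.keyOrdering, and the real code streams and spills via ExternalSorter rather than building a Map.

object ReduceSidePipeline {
  def read[K, C](blocks: Iterator[Iterator[(K, C)]],
                 mergeCombiners: Option[(C, C) => C],
                 keyOrdering: Option[Ordering[K]]): Iterator[(K, C)] = {
    // One logical record stream over all fetched blocks.
    val records = blocks.flatMap(identity)
    // If a combiner is defined, merge the already-combined values per key (in memory here).
    val combined = mergeCombiners match {
      case Some(merge) =>
        records.foldLeft(Map.empty[K, C]) { case (acc, (k, c)) =>
          acc.updated(k, acc.get(k).map(merge(_, c)).getOrElse(c))
        }.iterator
      case None => records
    }
    // If a key ordering is defined, sort; the real code uses ExternalSorter and can spill to disk.
    keyOrdering match {
      case Some(ord) => combined.toSeq.sortBy(_._1)(ord).iterator
      case None      => combined
    }
  }
}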
- */ -private[spark] class BlockStoreShuffleReader[K, C]( - handle: BaseShuffleHandle[K, _, C], - startPartition: Int, - endPartition: Int, - context: TaskContext, - blockManager: BlockManager = SparkEnv.get.blockManager, - mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) - extends ShuffleReader[K, C] with Logging { - - private val dep = handle.dependency - - /** Read the combined key-values for this reduce task */ - override def read(): Iterator[Product2[K, C]] = { - val blockFetcherItr = new ShuffleBlockFetcherIterator( - context, - blockManager.shuffleClient, - blockManager, - mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition), - // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility - SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) - - // Wrap the streams for compression based on configuration - val wrappedStreams = blockFetcherItr.map { case (blockId, inputStream) => - blockManager.wrapForCompression(blockId, inputStream) - } - - val ser = Serializer.getSerializer(dep.serializer) - val serializerInstance = ser.newInstance() - - // Create a key/value iterator for each stream - val recordIter = wrappedStreams.flatMap { wrappedStream => - // Note: the asKeyValueIterator below wraps a key/value iterator inside of a - // NextIterator. The NextIterator makes sure that close() is called on the - // underlying InputStream when all records have been read. - serializerInstance.deserializeStream(wrappedStream).asKeyValueIterator - } - - // Update the context task metrics for each record read. - val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() - val metricIter = CompletionIterator[(Any, Any), Iterator[(Any, Any)]]( - recordIter.map(record => { - readMetrics.incRecordsRead(1) - record - }), - context.taskMetrics().updateShuffleReadMetrics()) - - // An interruptible iterator must be used here in order to support task cancellation - val interruptibleIter = new InterruptibleIterator[(Any, Any)](context, metricIter) - - val aggregatedIter: Iterator[Product2[K, C]] = if (dep.aggregator.isDefined) { - if (dep.mapSideCombine) { - // We are reading values that are already combined - val combinedKeyValuesIterator = interruptibleIter.asInstanceOf[Iterator[(K, C)]] - dep.aggregator.get.combineCombinersByKey(combinedKeyValuesIterator, context) - } else { - // We don't know the value type, but also don't care -- the dependency *should* - // have made sure its compatible w/ this aggregator, which will convert the value - // type to the combined type C - val keyValuesIterator = interruptibleIter.asInstanceOf[Iterator[(K, Nothing)]] - dep.aggregator.get.combineValuesByKey(keyValuesIterator, context) - } - } else { - require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!") - interruptibleIter.asInstanceOf[Iterator[Product2[K, C]]] - } - - // Sort the output if there is a sort ordering defined. - dep.keyOrdering match { - case Some(keyOrd: Ordering[K]) => - // Create an ExternalSorter to sort the data. Note that if spark.shuffle.spill is disabled, - // the ExternalSorter won't spill to disk. 
- val sorter = - new ExternalSorter[K, C, C](context, ordering = Some(keyOrd), serializer = Some(ser)) - sorter.insertAll(aggregatedIter) - context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) - context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) - context.internalMetricsToAccumulators( - InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) - CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](sorter.iterator, sorter.stop()) - case None => - aggregatedIter - } - } -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala b/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala deleted file mode 100644 index be184464e0ae9..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import org.apache.spark.storage.BlockManagerId -import org.apache.spark.{FetchFailed, TaskEndReason} -import org.apache.spark.util.Utils - -/** - * Failed to fetch a shuffle block. The executor catches this exception and propagates it - * back to DAGScheduler (through TaskEndReason) so we'd resubmit the previous stage. - * - * Note that bmAddress can be null. - */ -private[spark] class FetchFailedException( - bmAddress: BlockManagerId, - shuffleId: Int, - mapId: Int, - reduceId: Int, - message: String, - cause: Throwable = null) - extends Exception(message, cause) { - - def this( - bmAddress: BlockManagerId, - shuffleId: Int, - mapId: Int, - reduceId: Int, - cause: Throwable) { - this(bmAddress, shuffleId, mapId, reduceId, cause.getMessage, cause) - } - - def toTaskEndReason: TaskEndReason = FetchFailed(bmAddress, shuffleId, mapId, reduceId, - Utils.exceptionString(this)) -} - -/** - * Failed to get shuffle metadata from [[org.apache.spark.MapOutputTracker]]. - */ -private[spark] class MetadataFetchFailedException( - shuffleId: Int, - reduceId: Int, - message: String) - extends FetchFailedException(null, shuffleId, -1, reduceId, message) diff --git a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala deleted file mode 100644 index cd253a78c2b19..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import java.util.concurrent.ConcurrentLinkedQueue - -import scala.collection.JavaConverters._ - -import org.apache.spark.{Logging, SparkConf, SparkEnv} -import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.netty.SparkTransportConf -import org.apache.spark.serializer.Serializer -import org.apache.spark.storage._ -import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap} - -/** A group of writers for a ShuffleMapTask, one writer per reducer. */ -private[spark] trait ShuffleWriterGroup { - val writers: Array[DiskBlockObjectWriter] - - /** @param success Indicates all writes were successful. If false, no blocks will be recorded. */ - def releaseWriters(success: Boolean) -} - -/** - * Manages assigning disk-based block writers to shuffle tasks. Each shuffle task gets one file - * per reducer. - */ -// Note: Changes to the format in this file should be kept in sync with -// org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getHashBasedShuffleBlockData(). -private[spark] class FileShuffleBlockResolver(conf: SparkConf) - extends ShuffleBlockResolver with Logging { - - private val transportConf = SparkTransportConf.fromSparkConf(conf) - - private lazy val blockManager = SparkEnv.get.blockManager - - // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided - private val bufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024 - - /** - * Contains all the state related to a particular shuffle. - */ - private class ShuffleState(val numReducers: Int) { - /** - * The mapIds of all map tasks completed on this Executor for this shuffle. - */ - val completedMapTasks = new ConcurrentLinkedQueue[Int]() - } - - private val shuffleStates = new TimeStampedHashMap[ShuffleId, ShuffleState] - - private val metadataCleaner = - new MetadataCleaner(MetadataCleanerType.SHUFFLE_BLOCK_MANAGER, this.cleanup, conf) - - /** - * Get a ShuffleWriterGroup for the given map task, which will register it as complete - * when the writers are closed successfully - */ - def forMapTask(shuffleId: Int, mapId: Int, numReducers: Int, serializer: Serializer, - writeMetrics: ShuffleWriteMetrics): ShuffleWriterGroup = { - new ShuffleWriterGroup { - shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numReducers)) - private val shuffleState = shuffleStates(shuffleId) - - val openStartTime = System.nanoTime - val serializerInstance = serializer.newInstance() - val writers: Array[DiskBlockObjectWriter] = { - Array.tabulate[DiskBlockObjectWriter](numReducers) { bucketId => - val blockId = ShuffleBlockId(shuffleId, mapId, bucketId) - val blockFile = blockManager.diskBlockManager.getFile(blockId) - // Because of previous failures, the shuffle file may already exist on this machine. - // If so, remove it. 
- if (blockFile.exists) { - if (blockFile.delete()) { - logInfo(s"Removed existing shuffle file $blockFile") - } else { - logWarning(s"Failed to remove existing shuffle file $blockFile") - } - } - blockManager.getDiskWriter(blockId, blockFile, serializerInstance, bufferSize, - writeMetrics) - } - } - // Creating the file to write to and creating a disk writer both involve interacting with - // the disk, so should be included in the shuffle write time. - writeMetrics.incShuffleWriteTime(System.nanoTime - openStartTime) - - override def releaseWriters(success: Boolean) { - shuffleState.completedMapTasks.add(mapId) - } - } - } - - override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { - val file = blockManager.diskBlockManager.getFile(blockId) - new FileSegmentManagedBuffer(transportConf, file, 0, file.length) - } - - /** Remove all the blocks / files and metadata related to a particular shuffle. */ - def removeShuffle(shuffleId: ShuffleId): Boolean = { - // Do not change the ordering of this, if shuffleStates should be removed only - // after the corresponding shuffle blocks have been removed - val cleaned = removeShuffleBlocks(shuffleId) - shuffleStates.remove(shuffleId) - cleaned - } - - /** Remove all the blocks / files related to a particular shuffle. */ - private def removeShuffleBlocks(shuffleId: ShuffleId): Boolean = { - shuffleStates.get(shuffleId) match { - case Some(state) => - for (mapId <- state.completedMapTasks.asScala; reduceId <- 0 until state.numReducers) { - val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) - val file = blockManager.diskBlockManager.getFile(blockId) - if (!file.delete()) { - logWarning(s"Error deleting ${file.getPath()}") - } - } - logInfo("Deleted all files for shuffle " + shuffleId) - true - case None => - logInfo("Could not find files for shuffle " + shuffleId + " for deleting") - false - } - } - - private def cleanup(cleanupTime: Long) { - shuffleStates.clearOldValues(cleanupTime, (shuffleId, state) => removeShuffleBlocks(shuffleId)) - } - - override def stop() { - metadataCleaner.cancel() - } -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala deleted file mode 100644 index 5e4c2b5d0a5c4..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.shuffle - -import java.io._ - -import com.google.common.io.ByteStreams - -import org.apache.spark.{SparkConf, SparkEnv, Logging} -import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.netty.SparkTransportConf -import org.apache.spark.storage._ -import org.apache.spark.util.Utils - -import IndexShuffleBlockResolver.NOOP_REDUCE_ID - -/** - * Create and maintain the shuffle blocks' mapping between logic block and physical file location. - * Data of shuffle blocks from the same map task are stored in a single consolidated data file. - * The offsets of the data blocks in the data file are stored in a separate index file. - * - * We use the name of the shuffle data's shuffleBlockId with reduce ID set to 0 and add ".data" - * as the filename postfix for data file, and ".index" as the filename postfix for index file. - * - */ -// Note: Changes to the format in this file should be kept in sync with -// org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getSortBasedShuffleBlockData(). -private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleBlockResolver - with Logging { - - private lazy val blockManager = SparkEnv.get.blockManager - - private val transportConf = SparkTransportConf.fromSparkConf(conf) - - def getDataFile(shuffleId: Int, mapId: Int): File = { - blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID)) - } - - private def getIndexFile(shuffleId: Int, mapId: Int): File = { - blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID)) - } - - /** - * Remove data file and index file that contain the output data from one map. - * */ - def removeDataByMap(shuffleId: Int, mapId: Int): Unit = { - var file = getDataFile(shuffleId, mapId) - if (file.exists()) { - if (!file.delete()) { - logWarning(s"Error deleting data ${file.getPath()}") - } - } - - file = getIndexFile(shuffleId, mapId) - if (file.exists()) { - if (!file.delete()) { - logWarning(s"Error deleting index ${file.getPath()}") - } - } - } - - /** - * Write an index file with the offsets of each block, plus a final offset at the end for the - * end of the output file. This will be used by getBlockData to figure out where each block - * begins and ends. - * */ - def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long]): Unit = { - val indexFile = getIndexFile(shuffleId, mapId) - val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile))) - Utils.tryWithSafeFinally { - // We take in lengths of each block, need to convert it to offsets. 
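Worked example, not part of this file: the index written by writeIndexFile below is just a running sum of the block lengths, and getBlockData later recovers a block's byte range from two adjacent entries.

object IndexFileArithmetic {
  def main(args: Array[String]): Unit = {
    val lengths = Array(10L, 0L, 25L)          // per-reduce block sizes from one map task
    val offsets = lengths.scanLeft(0L)(_ + _)  // written to the index file: 0, 10, 10, 35
    val reduceId = 2                           // getBlockData skips reduceId * 8 bytes ...
    val offset = offsets(reduceId)             // ... then reads two longs: 10
    val nextOffset = offsets(reduceId + 1)     // 35
    println(s"segment = [$offset, $nextOffset), length = ${nextOffset - offset}")  // length 25
  }
}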
- var offset = 0L - out.writeLong(offset) - for (length <- lengths) { - offset += length - out.writeLong(offset) - } - } { - out.close() - } - } - - override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { - // The block is actually going to be a range of a single map output file for this map, so - // find out the consolidated file, then the offset within that from our index - val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) - - val in = new DataInputStream(new FileInputStream(indexFile)) - try { - ByteStreams.skipFully(in, blockId.reduceId * 8) - val offset = in.readLong() - val nextOffset = in.readLong() - new FileSegmentManagedBuffer( - transportConf, - getDataFile(blockId.shuffleId, blockId.mapId), - offset, - nextOffset - offset) - } finally { - in.close() - } - } - - override def stop(): Unit = {} -} - -private[spark] object IndexShuffleBlockResolver { - // No-op reduce ID used in interactions with disk store. - // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort - // shuffle outputs for several reduces are glommed into a single file. - val NOOP_REDUCE_ID = 0 -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala deleted file mode 100644 index 4342b0d598b16..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import java.nio.ByteBuffer -import org.apache.spark.network.buffer.ManagedBuffer -import org.apache.spark.storage.ShuffleBlockId - -private[spark] -/** - * Implementers of this trait understand how to retrieve block data for a logical shuffle block - * identifier (i.e. map, reduce, and shuffle). Implementations may use files or file segments to - * encapsulate shuffle data. This is used by the BlockStore to abstract over different shuffle - * implementations when shuffle data is retrieved. - */ -trait ShuffleBlockResolver { - type ShuffleId = Int - - /** - * Retrieve the data for the specified block. If the data for that block is not available, - * throws an unspecified exception. - */ - def getBlockData(blockId: ShuffleBlockId): ManagedBuffer - - def stop(): Unit -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleHandle.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleHandle.scala deleted file mode 100644 index e04c97fe61894..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleHandle.scala +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import org.apache.spark.annotation.DeveloperApi - -/** - * An opaque handle to a shuffle, used by a ShuffleManager to pass information about it to tasks. - * - * @param shuffleId ID of the shuffle - */ -@DeveloperApi -abstract class ShuffleHandle(val shuffleId: Int) extends Serializable {} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala deleted file mode 100644 index 978366d1a1d1b..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import org.apache.spark.{TaskContext, ShuffleDependency} - -/** - * Pluggable interface for shuffle systems. A ShuffleManager is created in SparkEnv on the driver - * and on each executor, based on the spark.shuffle.manager setting. The driver registers shuffles - * with it, and executors (or tasks running locally in the driver) can ask to read and write data. - * - * NOTE: this will be instantiated by SparkEnv so its constructor can take a SparkConf and - * boolean isDriver as parameters. - */ -private[spark] trait ShuffleManager { - /** - * Register a shuffle with the manager and obtain a handle for it to pass to tasks. - */ - def registerShuffle[K, V, C]( - shuffleId: Int, - numMaps: Int, - dependency: ShuffleDependency[K, V, C]): ShuffleHandle - - /** Get a writer for a given partition. Called on executors by map tasks. */ - def getWriter[K, V](handle: ShuffleHandle, mapId: Int, context: TaskContext): ShuffleWriter[K, V] - - /** - * Get a reader for a range of reduce partitions (startPartition to endPartition-1, inclusive). - * Called on executors by reduce tasks. - */ - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext): ShuffleReader[K, C] - - /** - * Remove a shuffle's metadata from the ShuffleManager. 
- * @return true if the metadata removed successfully, otherwise false. - */ - def unregisterShuffle(shuffleId: Int): Boolean - - /** - * Return a resolver capable of retrieving shuffle block data based on block coordinates. - */ - def shuffleBlockResolver: ShuffleBlockResolver - - /** Shut down this ShuffleManager. */ - def stop(): Unit -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleReader.scala deleted file mode 100644 index 292e48314ee10..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleReader.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -/** - * Obtained inside a reduce task to read combined records from the mappers. - */ -private[spark] trait ShuffleReader[K, C] { - /** Read the combined key-values for this reduce task */ - def read(): Iterator[Product2[K, C]] - - /** - * Close this reader. - * TODO: Add this back when we make the ShuffleReader a developer API that others can implement - * (at which point this will likely be necessary). - */ - // def stop(): Unit -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriter.scala deleted file mode 100644 index 4cc4ef5f1886e..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriter.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle - -import java.io.IOException - -import org.apache.spark.scheduler.MapStatus - -/** - * Obtained inside a map task to write out records to the shuffle system. 
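Usage-level sketch for the pluggable interface described above: the implementation is chosen through the spark.shuffle.manager setting mentioned in the ShuffleManager scaladoc. The short values "sort" and "hash" correspond to the two managers in this codebase and are resolved by SparkEnv, which is outside this diff, so treat them as assumed here.

import org.apache.spark.SparkConf

object ShuffleManagerConfDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("shuffle-demo")
      .set("spark.shuffle.manager", "sort")  // or "hash" for the hash-based manager
    println(conf.get("spark.shuffle.manager"))
  }
}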
- */ -private[spark] abstract class ShuffleWriter[K, V] { - /** Write a sequence of records to this task's output */ - @throws[IOException] - def write(records: Iterator[Product2[K, V]]): Unit - - /** Close this writer, passing along whether the map completed */ - def stop(success: Boolean): Option[MapStatus] -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala deleted file mode 100644 index d2e2fc4c110a7..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle.hash - -import org.apache.spark._ -import org.apache.spark.shuffle._ - -/** - * A ShuffleManager using hashing, that creates one output file per reduce partition on each - * mapper (possibly reusing these across waves of tasks). - */ -private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { - - if (!conf.getBoolean("spark.shuffle.spill", true)) { - logWarning( - "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + - " Shuffle will continue to spill to disk when necessary.") - } - - private val fileShuffleBlockResolver = new FileShuffleBlockResolver(conf) - - /* Register a shuffle with the manager and obtain a handle for it to pass to tasks. */ - override def registerShuffle[K, V, C]( - shuffleId: Int, - numMaps: Int, - dependency: ShuffleDependency[K, V, C]): ShuffleHandle = { - new BaseShuffleHandle(shuffleId, numMaps, dependency) - } - - /** - * Get a reader for a range of reduce partitions (startPartition to endPartition-1, inclusive). - * Called on executors by reduce tasks. - */ - override def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext): ShuffleReader[K, C] = { - new BlockStoreShuffleReader( - handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) - } - - /** Get a writer for a given partition. Called on executors by map tasks. */ - override def getWriter[K, V](handle: ShuffleHandle, mapId: Int, context: TaskContext) - : ShuffleWriter[K, V] = { - new HashShuffleWriter( - shuffleBlockResolver, handle.asInstanceOf[BaseShuffleHandle[K, V, _]], mapId, context) - } - - /** Remove a shuffle's metadata from the ShuffleManager. */ - override def unregisterShuffle(shuffleId: Int): Boolean = { - shuffleBlockResolver.removeShuffle(shuffleId) - } - - override def shuffleBlockResolver: FileShuffleBlockResolver = { - fileShuffleBlockResolver - } - - /** Shut down this ShuffleManager. 
*/ - override def stop(): Unit = { - shuffleBlockResolver.stop() - } -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala deleted file mode 100644 index 41df70c602c30..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle.hash - -import org.apache.spark._ -import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.scheduler.MapStatus -import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle._ -import org.apache.spark.storage.DiskBlockObjectWriter - -private[spark] class HashShuffleWriter[K, V]( - shuffleBlockResolver: FileShuffleBlockResolver, - handle: BaseShuffleHandle[K, V, _], - mapId: Int, - context: TaskContext) - extends ShuffleWriter[K, V] with Logging { - - private val dep = handle.dependency - private val numOutputSplits = dep.partitioner.numPartitions - private val metrics = context.taskMetrics - - // Are we in the process of stopping? Because map tasks can call stop() with success = true - // and then call stop() with success = false if they get an exception, we want to make sure - // we don't try deleting files, etc twice. - private var stopping = false - - private val writeMetrics = new ShuffleWriteMetrics() - metrics.shuffleWriteMetrics = Some(writeMetrics) - - private val blockManager = SparkEnv.get.blockManager - private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null)) - private val shuffle = shuffleBlockResolver.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, - writeMetrics) - - /** Write a bunch of records to this task's output */ - override def write(records: Iterator[Product2[K, V]]): Unit = { - val iter = if (dep.aggregator.isDefined) { - if (dep.mapSideCombine) { - dep.aggregator.get.combineValuesByKey(records, context) - } else { - records - } - } else { - require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!") - records - } - - for (elem <- iter) { - val bucketId = dep.partitioner.getPartition(elem._1) - shuffle.writers(bucketId).write(elem._1, elem._2) - } - } - - /** Close this writer, passing along whether the map completed */ - override def stop(initiallySuccess: Boolean): Option[MapStatus] = { - var success = initiallySuccess - try { - if (stopping) { - return None - } - stopping = true - if (success) { - try { - Some(commitWritesAndBuildStatus()) - } catch { - case e: Exception => - success = false - revertWrites() - throw e - } - } else { - revertWrites() - None - } - } finally { - // Release the writers back to the shuffle block manager. 
- if (shuffle != null && shuffle.writers != null) { - try { - shuffle.releaseWriters(success) - } catch { - case e: Exception => logError("Failed to release shuffle writers", e) - } - } - } - } - - private def commitWritesAndBuildStatus(): MapStatus = { - // Commit the writes. Get the size of each bucket block (total block size). - val sizes: Array[Long] = shuffle.writers.map { writer: DiskBlockObjectWriter => - writer.commitAndClose() - writer.fileSegment().length - } - MapStatus(blockManager.shuffleServerId, sizes) - } - - private def revertWrites(): Unit = { - if (shuffle != null && shuffle.writers != null) { - for (writer <- shuffle.writers) { - writer.revertPartialWritesAndClose() - } - } - } -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala deleted file mode 100644 index 66b6bbc61fe8e..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle.sort - -import java.util.concurrent.ConcurrentHashMap - -import org.apache.spark._ -import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle._ - -/** - * In sort-based shuffle, incoming records are sorted according to their target partition ids, then - * written to a single map output file. Reducers fetch contiguous regions of this file in order to - * read their portion of the map output. In cases where the map output data is too large to fit in - * memory, sorted subsets of the output can are spilled to disk and those on-disk files are merged - * to produce the final output file. - * - * Sort-based shuffle has two different write paths for producing its map output files: - * - * - Serialized sorting: used when all three of the following conditions hold: - * 1. The shuffle dependency specifies no aggregation or output ordering. - * 2. The shuffle serializer supports relocation of serialized values (this is currently - * supported by KryoSerializer and Spark SQL's custom serializers). - * 3. The shuffle produces fewer than 16777216 output partitions. - * - Deserialized sorting: used to handle all other cases. - * - * ----------------------- - * Serialized sorting mode - * ----------------------- - * - * In the serialized sorting mode, incoming records are serialized as soon as they are passed to the - * shuffle writer and are buffered in a serialized form during sorting. This write path implements - * several optimizations: - * - * - Its sort operates on serialized binary data rather than Java objects, which reduces memory - * consumption and GC overheads. 
This optimization requires the record serializer to have certain - * properties to allow serialized records to be re-ordered without requiring deserialization. - * See SPARK-4550, where this optimization was first proposed and implemented, for more details. - * - * - It uses a specialized cache-efficient sorter ([[ShuffleExternalSorter]]) that sorts - * arrays of compressed record pointers and partition ids. By using only 8 bytes of space per - * record in the sorting array, this fits more of the array into cache. - * - * - The spill merging procedure operates on blocks of serialized records that belong to the same - * partition and does not need to deserialize records during the merge. - * - * - When the spill compression codec supports concatenation of compressed data, the spill merge - * simply concatenates the serialized and compressed spill partitions to produce the final output - * partition. This allows efficient data copying methods, like NIO's `transferTo`, to be used - * and avoids the need to allocate decompression or copying buffers during the merge. - * - * For more details on these optimizations, see SPARK-7081. - */ -private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { - - if (!conf.getBoolean("spark.shuffle.spill", true)) { - logWarning( - "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + - " Shuffle will continue to spill to disk when necessary.") - } - - /** - * A mapping from shuffle ids to the number of mappers producing output for those shuffles. - */ - private[this] val numMapsForShuffle = new ConcurrentHashMap[Int, Int]() - - override val shuffleBlockResolver = new IndexShuffleBlockResolver(conf) - - /** - * Register a shuffle with the manager and obtain a handle for it to pass to tasks. - */ - override def registerShuffle[K, V, C]( - shuffleId: Int, - numMaps: Int, - dependency: ShuffleDependency[K, V, C]): ShuffleHandle = { - if (SortShuffleWriter.shouldBypassMergeSort(SparkEnv.get.conf, dependency)) { - // If there are fewer than spark.shuffle.sort.bypassMergeThreshold partitions and we don't - // need map-side aggregation, then write numPartitions files directly and just concatenate - // them at the end. This avoids doing serialization and deserialization twice to merge - // together the spilled files, which would happen with the normal code path. The downside is - // having multiple files open at a time and thus more memory allocated to buffers. - new BypassMergeSortShuffleHandle[K, V]( - shuffleId, numMaps, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) - } else if (SortShuffleManager.canUseSerializedShuffle(dependency)) { - // Otherwise, try to buffer map outputs in a serialized form, since this is more efficient: - new SerializedShuffleHandle[K, V]( - shuffleId, numMaps, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) - } else { - // Otherwise, buffer map outputs in a deserialized form: - new BaseShuffleHandle(shuffleId, numMaps, dependency) - } - } - - /** - * Get a reader for a range of reduce partitions (startPartition to endPartition-1, inclusive). - * Called on executors by reduce tasks. - */ - override def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext): ShuffleReader[K, C] = { - new BlockStoreShuffleReader( - handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) - } - - /** Get a writer for a given partition. Called on executors by map tasks. 
*/ - override def getWriter[K, V]( - handle: ShuffleHandle, - mapId: Int, - context: TaskContext): ShuffleWriter[K, V] = { - numMapsForShuffle.putIfAbsent( - handle.shuffleId, handle.asInstanceOf[BaseShuffleHandle[_, _, _]].numMaps) - val env = SparkEnv.get - handle match { - case unsafeShuffleHandle: SerializedShuffleHandle[K @unchecked, V @unchecked] => - new UnsafeShuffleWriter( - env.blockManager, - shuffleBlockResolver.asInstanceOf[IndexShuffleBlockResolver], - context.taskMemoryManager(), - unsafeShuffleHandle, - mapId, - context, - env.conf) - case bypassMergeSortHandle: BypassMergeSortShuffleHandle[K @unchecked, V @unchecked] => - new BypassMergeSortShuffleWriter( - env.blockManager, - shuffleBlockResolver.asInstanceOf[IndexShuffleBlockResolver], - bypassMergeSortHandle, - mapId, - context, - env.conf) - case other: BaseShuffleHandle[K @unchecked, V @unchecked, _] => - new SortShuffleWriter(shuffleBlockResolver, other, mapId, context) - } - } - - /** Remove a shuffle's metadata from the ShuffleManager. */ - override def unregisterShuffle(shuffleId: Int): Boolean = { - Option(numMapsForShuffle.remove(shuffleId)).foreach { numMaps => - (0 until numMaps).foreach { mapId => - shuffleBlockResolver.removeDataByMap(shuffleId, mapId) - } - } - true - } - - /** Shut down this ShuffleManager. */ - override def stop(): Unit = { - shuffleBlockResolver.stop() - } -} - - -private[spark] object SortShuffleManager extends Logging { - - /** - * The maximum number of shuffle output partitions that SortShuffleManager supports when - * buffering map outputs in a serialized form. This is an extreme defensive programming measure, - * since it's extremely unlikely that a single shuffle produces over 16 million output partitions. - * */ - val MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE = - PackedRecordPointer.MAXIMUM_PARTITION_ID + 1 - - /** - * Helper method for determining whether a shuffle should use an optimized serialized shuffle - * path or whether it should fall back to the original path that operates on deserialized objects. - */ - def canUseSerializedShuffle(dependency: ShuffleDependency[_, _, _]): Boolean = { - val shufId = dependency.shuffleId - val numPartitions = dependency.partitioner.numPartitions - val serializer = Serializer.getSerializer(dependency.serializer) - if (!serializer.supportsRelocationOfSerializedObjects) { - log.debug(s"Can't use serialized shuffle for shuffle $shufId because the serializer, " + - s"${serializer.getClass.getName}, does not support object relocation") - false - } else if (dependency.aggregator.isDefined) { - log.debug( - s"Can't use serialized shuffle for shuffle $shufId because an aggregator is defined") - false - } else if (numPartitions > MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE) { - log.debug(s"Can't use serialized shuffle for shuffle $shufId because it has more than " + - s"$MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE partitions") - false - } else { - log.debug(s"Can use serialized shuffle for shuffle $shufId") - true - } - } -} - -/** - * Subclass of [[BaseShuffleHandle]], used to identify when we've chosen to use the - * serialized shuffle. - */ -private[spark] class SerializedShuffleHandle[K, V]( - shuffleId: Int, - numMaps: Int, - dependency: ShuffleDependency[K, V, V]) - extends BaseShuffleHandle(shuffleId, numMaps, dependency) { -} - -/** - * Subclass of [[BaseShuffleHandle]], used to identify when we've chosen to use the - * bypass merge sort shuffle path. 
- */ -private[spark] class BypassMergeSortShuffleHandle[K, V]( - shuffleId: Int, - numMaps: Int, - dependency: ShuffleDependency[K, V, V]) - extends BaseShuffleHandle(shuffleId, numMaps, dependency) { -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala deleted file mode 100644 index 808317b017a0f..0000000000000 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle.sort - -import org.apache.spark._ -import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.scheduler.MapStatus -import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle} -import org.apache.spark.storage.ShuffleBlockId -import org.apache.spark.util.collection.ExternalSorter - -private[spark] class SortShuffleWriter[K, V, C]( - shuffleBlockResolver: IndexShuffleBlockResolver, - handle: BaseShuffleHandle[K, V, C], - mapId: Int, - context: TaskContext) - extends ShuffleWriter[K, V] with Logging { - - private val dep = handle.dependency - - private val blockManager = SparkEnv.get.blockManager - - private var sorter: ExternalSorter[K, V, _] = null - - // Are we in the process of stopping? Because map tasks can call stop() with success = true - // and then call stop() with success = false if they get an exception, we want to make sure - // we don't try deleting files, etc twice. - private var stopping = false - - private var mapStatus: MapStatus = null - - private val writeMetrics = new ShuffleWriteMetrics() - context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics) - - /** Write a bunch of records to this task's output */ - override def write(records: Iterator[Product2[K, V]]): Unit = { - sorter = if (dep.mapSideCombine) { - require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!") - new ExternalSorter[K, V, C]( - context, dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer) - } else { - // In this case we pass neither an aggregator nor an ordering to the sorter, because we don't - // care whether the keys get sorted in each partition; that will be done on the reduce side - // if the operation being run is sortByKey. - new ExternalSorter[K, V, V]( - context, aggregator = None, Some(dep.partitioner), ordering = None, dep.serializer) - } - sorter.insertAll(records) - - // Don't bother including the time to open the merged output file in the shuffle write time, - // because it just opens a single file, so is typically too fast to measure accurately - // (see SPARK-3570). 
- val outputFile = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId) - val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID) - val partitionLengths = sorter.writePartitionedFile(blockId, outputFile) - shuffleBlockResolver.writeIndexFile(dep.shuffleId, mapId, partitionLengths) - - mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths) - } - - /** Close this writer, passing along whether the map completed */ - override def stop(success: Boolean): Option[MapStatus] = { - try { - if (stopping) { - return None - } - stopping = true - if (success) { - return Option(mapStatus) - } else { - // The map task failed, so delete our output data. - shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId) - return None - } - } finally { - // Clean up our sorter, which may have its own intermediate files - if (sorter != null) { - val startTime = System.nanoTime() - sorter.stop() - context.taskMetrics.shuffleWriteMetrics.foreach( - _.incShuffleWriteTime(System.nanoTime - startTime)) - sorter = null - } - } - } -} - -private[spark] object SortShuffleWriter { - def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = { - // We cannot bypass sorting if we need to do map-side aggregation. - if (dep.mapSideCombine) { - require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!") - false - } else { - val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) - dep.partitioner.numPartitions <= bypassMergeThreshold - } - } -} diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index c374b93766225..e7c64bfb02372 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -659,7 +659,7 @@ private[spark] class BlockManager( writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = { val compressStream: OutputStream => OutputStream = wrapForCompression(blockId, _) val syncWrites = conf.getBoolean("spark.shuffle.sync", false) - new DiskBlockObjectWriter(file, serializerInstance, bufferSize, compressStream, + new DiskBlockObjectWriter(blockId, file, serializerInstance, bufferSize, compressStream, syncWrites, writeMetrics) } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala index e749631bf6f19..7478ab0fc2f7a 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala @@ -19,7 +19,7 @@ package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} -import org.apache.spark.rpc.{ThreadSafeRpcEndpoint, RpcEnv, RpcCallContext, RpcEndpoint} +import org.apache.spark.rpc.{RpcEnv, RpcCallContext, RpcEndpoint} import org.apache.spark.util.ThreadUtils import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.storage.BlockManagerMessages._ @@ -33,7 +33,7 @@ class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) - extends ThreadSafeRpcEndpoint with Logging { + extends RpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") @@ -80,7 +80,7 @@ class BlockManagerSlaveEndpoint( 
future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) - logDebug("Sent response: " + response + " to " + context.senderAddress) + logDebug("Sent response: " + response + " to " + context.sender) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index 80d426fadc65e..49d9154f95a5b 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -34,6 +34,7 @@ import org.apache.spark.util.Utils * reopened again. */ private[spark] class DiskBlockObjectWriter( + val blockId: BlockId, file: File, serializerInstance: SerializerInstance, bufferSize: Int, @@ -143,10 +144,8 @@ private[spark] class DiskBlockObjectWriter( * Reverts writes that haven't been flushed yet. Callers should invoke this function * when there are runtime exceptions. This method will not throw, though it may be * unsuccessful in truncating written data. - * - * @return the file that this DiskBlockObjectWriter wrote to. */ - def revertPartialWritesAndClose(): File = { + def revertPartialWritesAndClose() { // Discard current writes. We do this by flushing the outstanding writes and then // truncating the file to its initial position. try { @@ -161,14 +160,12 @@ private[spark] class DiskBlockObjectWriter( val truncateStream = new FileOutputStream(file, true) try { truncateStream.getChannel.truncate(initialPosition) - file } finally { truncateStream.close() } } catch { case e: Exception => logError("Uncaught exception while reverting partial writes to file " + file, e) - file } } diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 53283448c87b1..395304c1c342b 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -84,11 +84,7 @@ private[spark] object ThreadUtils { */ def newDaemonSingleThreadScheduledExecutor(threadName: String): ScheduledExecutorService = { val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat(threadName).build() - val executor = new ScheduledThreadPoolExecutor(1, threadFactory) - // By default, a cancelled task is not automatically removed from the work queue until its delay - // elapses. We have to enable it manually. 
- executor.setRemoveOnCancelPolicy(true) - executor + Executors.newSingleThreadScheduledExecutor(threadFactory) } /** diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index 4763395d7d401..2474d3d8bcf0e 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -125,6 +125,7 @@ public DiskBlockObjectWriter answer(InvocationOnMock invocationOnMock) throws Th Object[] args = invocationOnMock.getArguments(); return new DiskBlockObjectWriter( + (BlockId) args[0], (File) args[1], (SerializerInstance) args[2], (Integer) args[3], diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 11c3a7be38875..c4397bd10f81b 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -125,6 +125,7 @@ public DiskBlockObjectWriter answer(InvocationOnMock invocationOnMock) throws Th Object[] args = invocationOnMock.getArguments(); return new DiskBlockObjectWriter( + (BlockId) args[0], (File) args[1], (SerializerInstance) args[2], (Integer) args[3], diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala index 3cd80c0f7d171..d3a4e9da10364 100644 --- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala +++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala @@ -19,10 +19,7 @@ package org.apache.spark import java.util.concurrent.{ExecutorService, TimeUnit} -import scala.collection.Map import scala.collection.mutable -import scala.concurrent.Await -import scala.concurrent.duration._ import scala.language.postfixOps import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} @@ -99,18 +96,18 @@ class HeartbeatReceiverSuite test("normal heartbeat") { heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) - addExecutorAndVerify(executorId1) - addExecutorAndVerify(executorId2) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null)) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null)) triggerHeartbeat(executorId1, executorShouldReregister = false) triggerHeartbeat(executorId2, executorShouldReregister = false) - val trackedExecutors = getTrackedExecutors + val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen()) assert(trackedExecutors.size === 2) assert(trackedExecutors.contains(executorId1)) assert(trackedExecutors.contains(executorId2)) } test("reregister if scheduler is not ready yet") { - addExecutorAndVerify(executorId1) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null)) // Task scheduler is not set yet in HeartbeatReceiver, so executors should reregister triggerHeartbeat(executorId1, executorShouldReregister = true) } @@ -119,20 +116,20 @@ class HeartbeatReceiverSuite heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) // Received heartbeat from unknown executor, so we ask it to re-register triggerHeartbeat(executorId1, executorShouldReregister = true) - assert(getTrackedExecutors.isEmpty) + 
assert(heartbeatReceiver.invokePrivate(_executorLastSeen()).isEmpty) } test("reregister if heartbeat from removed executor") { heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) - addExecutorAndVerify(executorId1) - addExecutorAndVerify(executorId2) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null)) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null)) // Remove the second executor but not the first - removeExecutorAndVerify(executorId2) + heartbeatReceiver.onExecutorRemoved(SparkListenerExecutorRemoved(0, executorId2, "bad boy")) // Now trigger the heartbeats // A heartbeat from the second executor should require reregistering triggerHeartbeat(executorId1, executorShouldReregister = false) triggerHeartbeat(executorId2, executorShouldReregister = true) - val trackedExecutors = getTrackedExecutors + val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen()) assert(trackedExecutors.size === 1) assert(trackedExecutors.contains(executorId1)) assert(!trackedExecutors.contains(executorId2)) @@ -141,8 +138,8 @@ class HeartbeatReceiverSuite test("expire dead hosts") { val executorTimeout = heartbeatReceiver.invokePrivate(_executorTimeoutMs()) heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) - addExecutorAndVerify(executorId1) - addExecutorAndVerify(executorId2) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null)) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null)) triggerHeartbeat(executorId1, executorShouldReregister = false) triggerHeartbeat(executorId2, executorShouldReregister = false) // Advance the clock and only trigger a heartbeat for the first executor @@ -152,7 +149,7 @@ class HeartbeatReceiverSuite heartbeatReceiverRef.askWithRetry[Boolean](ExpireDeadHosts) // Only the second executor should be expired as a dead host verify(scheduler).executorLost(Matchers.eq(executorId2), any()) - val trackedExecutors = getTrackedExecutors + val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen()) assert(trackedExecutors.size === 1) assert(trackedExecutors.contains(executorId1)) assert(!trackedExecutors.contains(executorId2)) @@ -178,8 +175,8 @@ class HeartbeatReceiverSuite fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisterExecutorResponse]( RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "dummy:4040", 0, Map.empty)) heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) - addExecutorAndVerify(executorId1) - addExecutorAndVerify(executorId2) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null)) + heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null)) triggerHeartbeat(executorId1, executorShouldReregister = false) triggerHeartbeat(executorId2, executorShouldReregister = false) @@ -225,26 +222,6 @@ class HeartbeatReceiverSuite } } - private def addExecutorAndVerify(executorId: String): Unit = { - assert( - heartbeatReceiver.addExecutor(executorId).map { f => - Await.result(f, 10.seconds) - } === Some(true)) - } - - private def removeExecutorAndVerify(executorId: String): Unit = { - assert( - heartbeatReceiver.removeExecutor(executorId).map { f => - Await.result(f, 10.seconds) - } === Some(true)) - } - - private def getTrackedExecutors: Map[String, Long] = { - // We may receive undesired SparkListenerExecutorAdded from LocalBackend, so exclude it from - // the map. See SPARK-10800. 
- heartbeatReceiver.invokePrivate(_executorLastSeen()). - filterKeys(_ != SparkContext.DRIVER_IDENTIFIER) - } } // TODO: use these classes to add end-to-end tests for dynamic allocation! diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index 7e70308bb360c..af4e68950f75a 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -168,9 +168,10 @@ class MapOutputTrackerSuite extends SparkFunSuite { masterTracker.registerShuffle(10, 1) masterTracker.registerMapOutput(10, 0, MapStatus( BlockManagerId("88", "mph", 1000), Array.fill[Long](10)(0))) - val senderAddress = RpcAddress("localhost", 12345) + val sender = mock(classOf[RpcEndpointRef]) + when(sender.address).thenReturn(RpcAddress("localhost", 12345)) val rpcCallContext = mock(classOf[RpcCallContext]) - when(rpcCallContext.senderAddress).thenReturn(senderAddress) + when(rpcCallContext.sender).thenReturn(sender) masterEndpoint.receiveAndReply(rpcCallContext)(GetMapOutputStatuses(10)) verify(rpcCallContext).reply(any()) verify(rpcCallContext, never()).sendFailure(any()) @@ -197,9 +198,10 @@ class MapOutputTrackerSuite extends SparkFunSuite { masterTracker.registerMapOutput(20, i, new CompressedMapStatus( BlockManagerId("999", "mps", 1000), Array.fill[Long](4000000)(0))) } - val senderAddress = RpcAddress("localhost", 12345) + val sender = mock(classOf[RpcEndpointRef]) + when(sender.address).thenReturn(RpcAddress("localhost", 12345)) val rpcCallContext = mock(classOf[RpcCallContext]) - when(rpcCallContext.senderAddress).thenReturn(senderAddress) + when(rpcCallContext.sender).thenReturn(sender) masterEndpoint.receiveAndReply(rpcCallContext)(GetMapOutputStatuses(20)) verify(rpcCallContext, never()).reply(any()) verify(rpcCallContext).sendFailure(isA(classOf[SparkException])) diff --git a/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala b/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala index 2d14249855c9d..33270bec6247c 100644 --- a/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala +++ b/core/src/test/scala/org/apache/spark/SSLSampleConfigs.scala @@ -41,7 +41,6 @@ object SSLSampleConfigs { def sparkSSLConfig(): SparkConf = { val conf = new SparkConf(loadDefaults = false) - conf.set("spark.rpc", "akka") conf.set("spark.ssl.enabled", "true") conf.set("spark.ssl.keyStore", keyStorePath) conf.set("spark.ssl.keyStorePassword", "password") @@ -55,7 +54,6 @@ object SSLSampleConfigs { def sparkSSLConfigUntrusted(): SparkConf = { val conf = new SparkConf(loadDefaults = false) - conf.set("spark.rpc", "akka") conf.set("spark.ssl.enabled", "true") conf.set("spark.ssl.keyStore", untrustedKeyStorePath) conf.set("spark.ssl.keyStorePassword", "password") diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index d145e78834b1b..0c63f1a8afe5a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -17,15 +17,10 @@ package org.apache.spark.deploy -import scala.concurrent.duration._ - import org.mockito.Mockito.{mock, when} import org.scalatest.BeforeAndAfterAll -import org.scalatest.concurrent.Eventually._ import org.apache.spark._ -import 
org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} -import org.apache.spark.deploy.master.ApplicationInfo import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv} @@ -61,10 +56,6 @@ class StandaloneDynamicAllocationSuite } master = makeMaster() workers = makeWorkers(10, 2048) - // Wait until all workers register with master successfully - eventually(timeout(60.seconds), interval(10.millis)) { - assert(getMasterState.workers.size === numWorkers) - } } override def afterAll(): Unit = { @@ -82,208 +73,167 @@ class StandaloneDynamicAllocationSuite test("dynamic allocation default behavior") { sc = new SparkContext(appConf) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - assert(apps.head.id === appId) - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // kill all executors assert(killAllExecutors(sc)) - var apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 1) // request 1 more assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === 2) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === 2) // request 1 more; this one won't go through assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === 3) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === 3) // kill all existing executors; we should end up with 3 - 2 = 1 executor assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 1) // kill all executors again; this time we'll have 1 - 1 = 0 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request many more; this increases the limit well beyond the cluster capacity assert(sc.requestExecutors(1000)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === 1000) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === 1000) } test("dynamic allocation with max cores <= cores per worker") { sc = new SparkContext(appConf.set("spark.cores.max", "8")) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - assert(apps.head.id === appId) - 
assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // kill all executors assert(killAllExecutors(sc)) - var apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.executors.values.head.cores === 8) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.executors.values.head.cores === 8) + assert(master.apps.head.getExecutorLimit === 1) // request 1 more; this one won't go through because we're already at max cores. // This highlights a limitation of using dynamic allocation with max cores WITHOUT // setting cores per executor: once an application scales down and then scales back // up, its executors may not be spread out anymore! assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.getExecutorLimit === 2) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 2) // request 1 more; this one also won't go through for the same reason assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.getExecutorLimit === 3) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 3) // kill all existing executors; we should end up with 3 - 1 = 2 executor // Note: we scheduled these executors together, so their cores should be evenly distributed assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) - assert(apps.head.getExecutorLimit === 2) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) + assert(master.apps.head.getExecutorLimit === 2) // kill all executors again; this time we'll have 1 - 1 = 0 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request many more; this increases the limit well beyond the cluster capacity assert(sc.requestExecutors(1000)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) - assert(apps.head.getExecutorLimit === 1000) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) + assert(master.apps.head.getExecutorLimit === 1000) } test("dynamic allocation with max cores > cores per worker") { sc = new SparkContext(appConf.set("spark.cores.max", "16")) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - 
assert(apps.head.id === appId) - assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // kill all executors assert(killAllExecutors(sc)) - var apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.executors.values.head.cores === 10) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.executors.values.head.cores === 10) + assert(master.apps.head.getExecutorLimit === 1) // request 1 more // Note: the cores are not evenly distributed because we scheduled these executors 1 by 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toSet === Set(10, 6)) - assert(apps.head.getExecutorLimit === 2) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toSet === Set(10, 6)) + assert(master.apps.head.getExecutorLimit === 2) // request 1 more; this one won't go through assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === 3) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === 3) // kill all existing executors; we should end up with 3 - 2 = 1 executor assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.executors.values.head.cores === 10) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.executors.values.head.cores === 10) + assert(master.apps.head.getExecutorLimit === 1) // kill all executors again; this time we'll have 1 - 1 = 0 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request many more; this increases the limit well beyond the cluster capacity assert(sc.requestExecutors(1000)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) - assert(apps.head.getExecutorLimit === 1000) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) + assert(master.apps.head.getExecutorLimit === 1000) } test("dynamic allocation with cores per executor") { sc = new SparkContext(appConf.set("spark.executor.cores", "2")) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - assert(apps.head.id === appId) - assert(apps.head.executors.size === 10) // 20 cores total - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) 
+ assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 10) // 20 cores total + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // kill all executors assert(killAllExecutors(sc)) - var apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 1) // request 3 more assert(sc.requestExecutors(3)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 4) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 4) // request 10 more; only 6 will go through assert(sc.requestExecutors(10)) - apps = getApplications() - assert(apps.head.executors.size === 10) - assert(apps.head.getExecutorLimit === 14) + assert(master.apps.head.executors.size === 10) + assert(master.apps.head.getExecutorLimit === 14) // kill 2 executors; we should get 2 back immediately assert(killNExecutors(sc, 2)) - apps = getApplications() - assert(apps.head.executors.size === 10) - assert(apps.head.getExecutorLimit === 12) + assert(master.apps.head.executors.size === 10) + assert(master.apps.head.getExecutorLimit === 12) // kill 4 executors; we should end up with 12 - 4 = 8 executors assert(killNExecutors(sc, 4)) - apps = getApplications() - assert(apps.head.executors.size === 8) - assert(apps.head.getExecutorLimit === 8) + assert(master.apps.head.executors.size === 8) + assert(master.apps.head.getExecutorLimit === 8) // kill all executors; this time we'll have 8 - 8 = 0 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request many more; this increases the limit well beyond the cluster capacity assert(sc.requestExecutors(1000)) - apps = getApplications() - assert(apps.head.executors.size === 10) - assert(apps.head.getExecutorLimit === 1000) + assert(master.apps.head.executors.size === 10) + assert(master.apps.head.getExecutorLimit === 1000) } test("dynamic allocation with cores per executor AND max cores") { @@ -291,70 +241,55 @@ class StandaloneDynamicAllocationSuite .set("spark.executor.cores", "2") .set("spark.cores.max", "8")) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - assert(apps.head.id === appId) - assert(apps.head.executors.size === 4) // 8 cores total - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 4) // 8 cores total + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // kill all executors assert(killAllExecutors(sc)) - var apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request 1 assert(sc.requestExecutors(1)) - apps = getApplications() - assert(apps.head.executors.size === 1) - 
assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.executors.size === 1) + assert(master.apps.head.getExecutorLimit === 1) // request 3 more assert(sc.requestExecutors(3)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 4) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 4) // request 10 more; none will go through assert(sc.requestExecutors(10)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 14) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 14) // kill all executors; 4 executors will be launched immediately assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 10) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 10) // ... and again assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 6) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 6) // ... and again; now we end up with 6 - 4 = 2 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === 2) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === 2) // ... and again; this time we have 2 - 2 = 0 executors left assert(killAllExecutors(sc)) - apps = getApplications() - assert(apps.head.executors.size === 0) - assert(apps.head.getExecutorLimit === 0) + assert(master.apps.head.executors.size === 0) + assert(master.apps.head.getExecutorLimit === 0) // request many more; this increases the limit well beyond the cluster capacity assert(sc.requestExecutors(1000)) - apps = getApplications() - assert(apps.head.executors.size === 4) - assert(apps.head.getExecutorLimit === 1000) + assert(master.apps.head.executors.size === 4) + assert(master.apps.head.getExecutorLimit === 1000) } test("kill the same executor twice (SPARK-9795)") { sc = new SparkContext(appConf) val appId = sc.applicationId - eventually(timeout(10.seconds), interval(10.millis)) { - val apps = getApplications() - assert(apps.size === 1) - assert(apps.head.id === appId) - assert(apps.head.executors.size === 2) - assert(apps.head.getExecutorLimit === Int.MaxValue) - } + assert(master.apps.size === 1) + assert(master.apps.head.id === appId) + assert(master.apps.head.executors.size === 2) + assert(master.apps.head.getExecutorLimit === Int.MaxValue) // sync executors between the Master and the driver, needed because // the driver refuses to kill executors it does not know about syncExecutors(sc) @@ -363,10 +298,9 @@ class StandaloneDynamicAllocationSuite assert(executors.size === 2) assert(sc.killExecutor(executors.head)) assert(sc.killExecutor(executors.head)) - val apps = getApplications() - assert(apps.head.executors.size === 1) + assert(master.apps.head.executors.size === 1) // The limit should not be lowered twice - assert(apps.head.getExecutorLimit === 1) + assert(master.apps.head.getExecutorLimit === 1) } test("the pending replacement executors should not be lost (SPARK-10515)") { @@ -434,16 +368,6 @@ class StandaloneDynamicAllocationSuite } } - /** Get the Master state */ - private def getMasterState: MasterStateResponse = { - 
master.self.askWithRetry[MasterStateResponse](RequestMasterState) - } - - /** Get the applictions that are active from Master */ - private def getApplications(): Seq[ApplicationInfo] = { - getMasterState.activeApps - } - /** Kill all executors belonging to this application. */ private def killAllExecutors(sc: SparkContext): Boolean = { killNExecutors(sc, Int.MaxValue) @@ -463,11 +387,8 @@ class StandaloneDynamicAllocationSuite * don't wait for executors to register. Otherwise the tests will take much longer to run. */ private def getExecutorIds(sc: SparkContext): Seq[String] = { - val app = getApplications().find(_.id == sc.applicationId) - assert(app.isDefined) - // Although executors is transient, master is in the same process so the message won't be - // serialized and it's safe here. - app.get.executors.keys.map(_.toString).toSeq + assert(master.idToApp.contains(sc.applicationId)) + master.idToApp(sc.applicationId).executors.keys.map(_.toString).toSeq } /** diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala deleted file mode 100644 index 34f27ecaa07a3..0000000000000 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.deploy.history - -import java.io.File -import java.nio.charset.StandardCharsets._ - -import com.google.common.io.Files - -import org.apache.spark._ -import org.apache.spark.util.Utils - -class HistoryServerArgumentsSuite extends SparkFunSuite { - - private val logDir = new File("src/test/resources/spark-events") - private val conf = new SparkConf() - .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) - .set("spark.history.fs.updateInterval", "1") - .set("spark.testing", "true") - - test("No Arguments Parsing") { - val argStrings = Array[String]() - val hsa = new HistoryServerArguments(conf, argStrings) - assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) - assert(conf.get("spark.history.fs.updateInterval") === "1") - assert(conf.get("spark.testing") === "true") - } - - test("Directory Arguments Parsing --dir or -d") { - val argStrings = Array("--dir", "src/test/resources/spark-events1") - val hsa = new HistoryServerArguments(conf, argStrings) - assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1") - } - - test("Directory Param can also be set directly") { - val argStrings = Array("src/test/resources/spark-events2") - val hsa = new HistoryServerArguments(conf, argStrings) - assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2") - } - - test("Properties File Arguments Parsing --properties-file") { - val tmpDir = Utils.createTempDir() - val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) - try { - Files.write("spark.test.CustomPropertyA blah\n" + - "spark.test.CustomPropertyB notblah\n", outFile, UTF_8) - val argStrings = Array("--properties-file", outFile.getAbsolutePath) - val hsa = new HistoryServerArguments(conf, argStrings) - assert(conf.get("spark.test.CustomPropertyA") === "blah") - assert(conf.get("spark.test.CustomPropertyB") === "notblah") - } finally { - Utils.deleteRecursively(tmpDir) - } - } - -} diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala index 40c24bdecc6ce..e9034e39a715c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerWatcherSuite.scala @@ -26,7 +26,8 @@ class WorkerWatcherSuite extends SparkFunSuite { val conf = new SparkConf() val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf)) val targetWorkerUrl = rpcEnv.uriOf("test", RpcAddress("1.2.3.4", 1234), "Worker") - val workerWatcher = new WorkerWatcher(rpcEnv, targetWorkerUrl, isTesting = true) + val workerWatcher = new WorkerWatcher(rpcEnv, targetWorkerUrl) + workerWatcher.setTesting(testing = true) rpcEnv.setupEndpoint("worker-watcher", workerWatcher) workerWatcher.onDisconnected(RpcAddress("1.2.3.4", 1234)) assert(workerWatcher.isShutDown) @@ -38,7 +39,8 @@ class WorkerWatcherSuite extends SparkFunSuite { val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf)) val targetWorkerUrl = rpcEnv.uriOf("test", RpcAddress("1.2.3.4", 1234), "Worker") val otherRpcAddress = RpcAddress("4.3.2.1", 1234) - val workerWatcher = new WorkerWatcher(rpcEnv, targetWorkerUrl, isTesting = true) + val workerWatcher = new WorkerWatcher(rpcEnv, targetWorkerUrl) + workerWatcher.setTesting(testing = true) rpcEnv.setupEndpoint("worker-watcher", workerWatcher) workerWatcher.onDisconnected(otherRpcAddress) 
assert(!workerWatcher.isShutDown) diff --git a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala index 834e4743df866..27d00e8eef022 100644 --- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.rpc -import java.io.NotSerializableException import java.util.concurrent.{TimeUnit, CountDownLatch, TimeoutException} import scala.collection.mutable @@ -100,6 +99,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { } } val rpcEndpointRef = env.setupEndpoint("send-ref", endpoint) + val newRpcEndpointRef = rpcEndpointRef.askWithRetry[RpcEndpointRef]("Hello") val reply = newRpcEndpointRef.askWithRetry[String]("Echo") assert("Echo" === reply) @@ -328,6 +328,9 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { override def onStop(): Unit = { selfOption = Option(self) } + + override def onError(cause: Throwable): Unit = { + } }) env.stop(endpointRef) @@ -549,12 +552,9 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { "local", env.address, "sendWithReply-unserializable-error") try { val f = rpcEndpointRef.ask[String]("hello") - val e = intercept[Exception] { + intercept[TimeoutException] { Await.result(f, 1 seconds) } - assert(e.isInstanceOf[TimeoutException] || // For Akka - e.isInstanceOf[NotSerializableException] // For Netty - ) } finally { anotherEnv.shutdown() anotherEnv.awaitTermination() @@ -695,7 +695,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { // once the future is complete to verify addMessageIfTimeout was invoked val reply3 = intercept[RpcTimeoutException] { - Await.result(fut3, 2000 millis) + Await.result(fut3, 200 millis) }.getMessage // When the future timed out, the recover callback should have used diff --git a/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala b/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala deleted file mode 100644 index 5e8da3e205ab0..0000000000000 --- a/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.rpc - -import scala.collection.mutable.ArrayBuffer - -import org.scalactic.TripleEquals - -class TestRpcEndpoint extends ThreadSafeRpcEndpoint with TripleEquals { - - override val rpcEnv: RpcEnv = null - - @volatile private var receiveMessages = ArrayBuffer[Any]() - - @volatile private var receiveAndReplyMessages = ArrayBuffer[Any]() - - @volatile private var onConnectedMessages = ArrayBuffer[RpcAddress]() - - @volatile private var onDisconnectedMessages = ArrayBuffer[RpcAddress]() - - @volatile private var onNetworkErrorMessages = ArrayBuffer[(Throwable, RpcAddress)]() - - @volatile private var started = false - - @volatile private var stopped = false - - override def receive: PartialFunction[Any, Unit] = { - case message: Any => receiveMessages += message - } - - override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case message: Any => receiveAndReplyMessages += message - } - - override def onConnected(remoteAddress: RpcAddress): Unit = { - onConnectedMessages += remoteAddress - } - - /** - * Invoked when some network error happens in the connection between the current node and - * `remoteAddress`. - */ - override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { - onNetworkErrorMessages += cause -> remoteAddress - } - - override def onDisconnected(remoteAddress: RpcAddress): Unit = { - onDisconnectedMessages += remoteAddress - } - - def numReceiveMessages: Int = receiveMessages.size - - override def onStart(): Unit = { - started = true - } - - override def onStop(): Unit = { - stopped = true - } - - def verifyStarted(): Unit = { - assert(started, "RpcEndpoint is not started") - } - - def verifyStopped(): Unit = { - assert(stopped, "RpcEndpoint is not stopped") - } - - def verifyReceiveMessages(expected: Seq[Any]): Unit = { - assert(receiveMessages === expected) - } - - def verifySingleReceiveMessage(message: Any): Unit = { - verifyReceiveMessages(List(message)) - } - - def verifyReceiveAndReplyMessages(expected: Seq[Any]): Unit = { - assert(receiveAndReplyMessages === expected) - } - - def verifySingleReceiveAndReplyMessage(message: Any): Unit = { - verifyReceiveAndReplyMessages(List(message)) - } - - def verifySingleOnConnectedMessage(remoteAddress: RpcAddress): Unit = { - verifyOnConnectedMessages(List(remoteAddress)) - } - - def verifyOnConnectedMessages(expected: Seq[RpcAddress]): Unit = { - assert(onConnectedMessages === expected) - } - - def verifySingleOnDisconnectedMessage(remoteAddress: RpcAddress): Unit = { - verifyOnDisconnectedMessages(List(remoteAddress)) - } - - def verifyOnDisconnectedMessages(expected: Seq[RpcAddress]): Unit = { - assert(onDisconnectedMessages === expected) - } - - def verifySingleOnNetworkErrorMessage(cause: Throwable, remoteAddress: RpcAddress): Unit = { - verifyOnNetworkErrorMessages(List(cause -> remoteAddress)) - } - - def verifyOnNetworkErrorMessages(expected: Seq[(Throwable, RpcAddress)]): Unit = { - assert(onNetworkErrorMessages === expected) - } -} diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala deleted file mode 100644 index 276c077b3d13e..0000000000000 --- a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import java.util.concurrent.{CountDownLatch, TimeUnit} -import java.util.concurrent.atomic.AtomicInteger - -import org.mockito.Mockito._ - -import org.apache.spark.SparkFunSuite -import org.apache.spark.rpc.{RpcEnv, RpcEndpoint, RpcAddress, TestRpcEndpoint} - -class InboxSuite extends SparkFunSuite { - - test("post") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - when(endpointRef.name).thenReturn("hello") - - val dispatcher = mock(classOf[Dispatcher]) - - val inbox = new Inbox(endpointRef, endpoint) - val message = ContentMessage(null, "hi", false, null) - inbox.post(message) - inbox.process(dispatcher) - assert(inbox.isEmpty) - - endpoint.verifySingleReceiveMessage("hi") - - inbox.stop() - inbox.process(dispatcher) - assert(inbox.isEmpty) - endpoint.verifyStarted() - endpoint.verifyStopped() - } - - test("post: with reply") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - val dispatcher = mock(classOf[Dispatcher]) - - val inbox = new Inbox(endpointRef, endpoint) - val message = ContentMessage(null, "hi", true, null) - inbox.post(message) - inbox.process(dispatcher) - assert(inbox.isEmpty) - - endpoint.verifySingleReceiveAndReplyMessage("hi") - } - - test("post: multiple threads") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - when(endpointRef.name).thenReturn("hello") - - val dispatcher = mock(classOf[Dispatcher]) - - val numDroppedMessages = new AtomicInteger(0) - val inbox = new Inbox(endpointRef, endpoint) { - override def onDrop(message: InboxMessage): Unit = { - numDroppedMessages.incrementAndGet() - } - } - - val exitLatch = new CountDownLatch(10) - - for (_ <- 0 until 10) { - new Thread { - override def run(): Unit = { - for (_ <- 0 until 100) { - val message = ContentMessage(null, "hi", false, null) - inbox.post(message) - } - exitLatch.countDown() - } - }.start() - } - // Try to process some messages - inbox.process(dispatcher) - inbox.stop() - // After `stop` is called, further messages will be dropped. However, while `stop` is called, - // some messages may be post to Inbox, so process them here. 
- inbox.process(dispatcher) - assert(inbox.isEmpty) - - exitLatch.await(30, TimeUnit.SECONDS) - - assert(1000 === endpoint.numReceiveMessages + numDroppedMessages.get) - endpoint.verifyStarted() - endpoint.verifyStopped() - } - - test("post: Associated") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - val dispatcher = mock(classOf[Dispatcher]) - - val remoteAddress = RpcAddress("localhost", 11111) - - val inbox = new Inbox(endpointRef, endpoint) - inbox.post(RemoteProcessConnected(remoteAddress)) - inbox.process(dispatcher) - - endpoint.verifySingleOnConnectedMessage(remoteAddress) - } - - test("post: Disassociated") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - val dispatcher = mock(classOf[Dispatcher]) - - val remoteAddress = RpcAddress("localhost", 11111) - - val inbox = new Inbox(endpointRef, endpoint) - inbox.post(RemoteProcessDisconnected(remoteAddress)) - inbox.process(dispatcher) - - endpoint.verifySingleOnDisconnectedMessage(remoteAddress) - } - - test("post: AssociationError") { - val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - val dispatcher = mock(classOf[Dispatcher]) - - val remoteAddress = RpcAddress("localhost", 11111) - val cause = new RuntimeException("Oops") - - val inbox = new Inbox(endpointRef, endpoint) - inbox.post(RemoteProcessConnectionError(cause, remoteAddress)) - inbox.process(dispatcher) - - endpoint.verifySingleOnNetworkErrorMessage(cause, remoteAddress) - } -} diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala deleted file mode 100644 index 56743ba650b41..0000000000000 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.rpc.netty - -import org.apache.spark.SparkFunSuite - -class NettyRpcAddressSuite extends SparkFunSuite { - - test("toString") { - val addr = new RpcEndpointAddress("localhost", 12345, "test") - assert(addr.toString === "spark://test@localhost:12345") - } - - test("toString for client mode") { - val addr = RpcEndpointAddress(null, "test") - assert(addr.toString === "spark-client://test") - } - -} diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala deleted file mode 100644 index ce83087ec04d6..0000000000000 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rpc.netty - -import org.apache.spark.{SecurityManager, SparkConf} -import org.apache.spark.rpc._ - -class NettyRpcEnvSuite extends RpcEnvSuite { - - override def createRpcEnv( - conf: SparkConf, - name: String, - port: Int, - clientMode: Boolean = false): RpcEnv = { - val config = RpcEnvConfig(conf, "test", "localhost", port, new SecurityManager(conf), - clientMode) - new NettyRpcEnvFactory().create(config) - } - - test("non-existent endpoint") { - val uri = env.uriOf("test", env.address, "nonexist-endpoint") - val e = intercept[RpcEndpointNotFoundException] { - env.setupEndpointRef("test", env.address, "nonexist-endpoint") - } - assert(e.getMessage.contains(uri)) - } - -} diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala deleted file mode 100644 index f9d8e80c98b66..0000000000000 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.rpc.netty - -import java.net.InetSocketAddress - -import io.netty.channel.Channel -import org.mockito.Mockito._ -import org.mockito.Matchers._ - -import org.apache.spark.SparkFunSuite -import org.apache.spark.network.client.{TransportResponseHandler, TransportClient} -import org.apache.spark.rpc._ - -class NettyRpcHandlerSuite extends SparkFunSuite { - - val env = mock(classOf[NettyRpcEnv]) - when(env.deserialize(any(classOf[TransportClient]), any(classOf[Array[Byte]]))(any())). - thenReturn(RequestMessage(RpcAddress("localhost", 12345), null, null, false)) - - test("receive") { - val dispatcher = mock(classOf[Dispatcher]) - val nettyRpcHandler = new NettyRpcHandler(dispatcher, env) - - val channel = mock(classOf[Channel]) - val client = new TransportClient(channel, mock(classOf[TransportResponseHandler])) - when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40000)) - nettyRpcHandler.receive(client, null, null) - - verify(dispatcher, times(1)).postToAll(RemoteProcessConnected(RpcAddress("localhost", 40000))) - } - - test("connectionTerminated") { - val dispatcher = mock(classOf[Dispatcher]) - val nettyRpcHandler = new NettyRpcHandler(dispatcher, env) - - val channel = mock(classOf[Channel]) - val client = new TransportClient(channel, mock(classOf[TransportResponseHandler])) - when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40000)) - nettyRpcHandler.receive(client, null, null) - - when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40000)) - nettyRpcHandler.connectionTerminated(client) - - verify(dispatcher, times(1)).postToAll(RemoteProcessConnected(RpcAddress("localhost", 40000))) - verify(dispatcher, times(1)).postToAll( - RemoteProcessDisconnected(RpcAddress("localhost", 40000))) - } - -} diff --git a/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala index e0f474aa505c1..3fe28027c3c21 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler +import org.apache.spark.rdd.{ShuffledRDDPartition, RDD, ShuffledRDD} import org.apache.spark._ object AdaptiveSchedulingSuiteState { @@ -27,10 +28,26 @@ object AdaptiveSchedulingSuiteState { } } +/** A special ShuffledRDD where we can pass a ShuffleDependency object to use */ +class CustomShuffledRDD[K, V, C](@transient dep: ShuffleDependency[K, V, C]) + extends RDD[(K, C)](dep.rdd.context, Seq(dep)) { + + override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { + val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] + SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) + .read() + .asInstanceOf[Iterator[(K, C)]] + } + + override def getPartitions: Array[Partition] = { + Array.tabulate[Partition](dep.partitioner.numPartitions)(i => new ShuffledRDDPartition(i)) + } +} + class AdaptiveSchedulingSuite extends SparkFunSuite with LocalSparkContext { test("simple use of submitMapStage") { try { - sc = new SparkContext("local", "test") + sc = new SparkContext("local[1,2]", "test") val rdd = sc.parallelize(1 to 3, 3).map { x => AdaptiveSchedulingSuiteState.tasksRun += 1 (x, x) @@ -45,32 +62,4 @@ class AdaptiveSchedulingSuite extends SparkFunSuite with LocalSparkContext { 
AdaptiveSchedulingSuiteState.clear() } } - - test("fetching multiple map output partitions per reduce") { - sc = new SparkContext("local", "test") - val rdd = sc.parallelize(0 to 2, 3).map(x => (x, x)) - val dep = new ShuffleDependency[Int, Int, Int](rdd, new HashPartitioner(3)) - val shuffled = new CustomShuffledRDD[Int, Int, Int](dep, Array(0, 2)) - assert(shuffled.partitions.length === 2) - assert(shuffled.glom().map(_.toSet).collect().toSet == Set(Set((0, 0), (1, 1)), Set((2, 2)))) - } - - test("fetching all map output partitions in one reduce") { - sc = new SparkContext("local", "test") - val rdd = sc.parallelize(0 to 2, 3).map(x => (x, x)) - // Also create lots of hash partitions so that some of them are empty - val dep = new ShuffleDependency[Int, Int, Int](rdd, new HashPartitioner(5)) - val shuffled = new CustomShuffledRDD[Int, Int, Int](dep, Array(0)) - assert(shuffled.partitions.length === 1) - assert(shuffled.collect().toSet == Set((0, 0), (1, 1), (2, 2))) - } - - test("more reduce tasks than map output partitions") { - sc = new SparkContext("local", "test") - val rdd = sc.parallelize(0 to 2, 3).map(x => (x, x)) - val dep = new ShuffleDependency[Int, Int, Int](rdd, new HashPartitioner(3)) - val shuffled = new CustomShuffledRDD[Int, Int, Int](dep, Array(0, 0, 0, 1, 1, 1, 2)) - assert(shuffled.partitions.length === 7) - assert(shuffled.collect().toSet == Set((0, 0), (1, 1), (2, 2))) - } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala b/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala deleted file mode 100644 index d8d818ceed45f..0000000000000 --- a/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.scheduler - -import java.util.Arrays - -import org.apache.spark._ -import org.apache.spark.rdd.RDD - -/** - * A Partitioner that might group together one or more partitions from the parent. - * - * @param parent a parent partitioner - * @param partitionStartIndices indices of partitions in parent that should create new partitions - * in child (this should be an array of increasing partition IDs). For example, if we have a - * parent with 5 partitions, and partitionStartIndices is [0, 2, 4], we get three output - * partitions, corresponding to partition ranges [0, 1], [2, 3] and [4] of the parent partitioner. 
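The doc comment above describes how `partitionStartIndices` carves a parent partitioner's space into ranges. A standalone sketch of that parent-to-child mapping, mirroring the `parentPartitionMapping` array built by the `CoalescedPartitioner` that follows (the object and method names here are illustrative):

```scala
// Standalone sketch of the parent-partition -> child-partition mapping described above.
object CoalescedMappingDemo extends App {
  def parentToChild(numParentPartitions: Int, startIndices: Array[Int]): Array[Int] = {
    val mapping = new Array[Int](numParentPartitions)
    for (child <- startIndices.indices) {
      val start = startIndices(child)
      val end =
        if (child < startIndices.length - 1) startIndices(child + 1) else numParentPartitions
      for (parent <- start until end) {
        mapping(parent) = child   // every parent partition in [start, end) maps to this child
      }
    }
    mapping
  }

  // 5 parent partitions grouped at [0, 2, 4] -> child ranges [0, 1], [2, 3], [4]
  println(parentToChild(5, Array(0, 2, 4)).mkString(", "))   // prints: 0, 0, 1, 1, 2
}
```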
- */ -class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: Array[Int]) - extends Partitioner { - - @transient private lazy val parentPartitionMapping: Array[Int] = { - val n = parent.numPartitions - val result = new Array[Int](n) - for (i <- 0 until partitionStartIndices.length) { - val start = partitionStartIndices(i) - val end = if (i < partitionStartIndices.length - 1) partitionStartIndices(i + 1) else n - for (j <- start until end) { - result(j) = i - } - } - result - } - - override def numPartitions: Int = partitionStartIndices.size - - override def getPartition(key: Any): Int = { - parentPartitionMapping(parent.getPartition(key)) - } - - override def equals(other: Any): Boolean = other match { - case c: CoalescedPartitioner => - c.parent == parent && Arrays.equals(c.partitionStartIndices, partitionStartIndices) - case _ => - false - } -} - -private[spark] class CustomShuffledRDDPartition( - val index: Int, val startIndexInParent: Int, val endIndexInParent: Int) - extends Partition { - - override def hashCode(): Int = index -} - -/** - * A special ShuffledRDD that supports a ShuffleDependency object from outside and launching reduce - * tasks that read multiple map output partitions. - */ -class CustomShuffledRDD[K, V, C]( - var dependency: ShuffleDependency[K, V, C], - partitionStartIndices: Array[Int]) - extends RDD[(K, C)](dependency.rdd.context, Seq(dependency)) { - - def this(dep: ShuffleDependency[K, V, C]) = { - this(dep, (0 until dep.partitioner.numPartitions).toArray) - } - - override def getDependencies: Seq[Dependency[_]] = List(dependency) - - override val partitioner = { - Some(new CoalescedPartitioner(dependency.partitioner, partitionStartIndices)) - } - - override def getPartitions: Array[Partition] = { - val n = dependency.partitioner.numPartitions - Array.tabulate[Partition](partitionStartIndices.length) { i => - val startIndex = partitionStartIndices(i) - val endIndex = if (i < partitionStartIndices.length - 1) partitionStartIndices(i + 1) else n - new CustomShuffledRDDPartition(i, startIndex, endIndex) - } - } - - override def compute(p: Partition, context: TaskContext): Iterator[(K, C)] = { - val part = p.asInstanceOf[CustomShuffledRDDPartition] - SparkEnv.get.shuffleManager.getReader( - dependency.shuffleHandle, part.startIndexInParent, part.endIndexInParent, context) - .read() - .asInstanceOf[Iterator[(K, C)]] - } - - override def clearDependencies() { - super.clearDependencies() - dependency = null - } -} diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 3816b8c4a09aa..883dc3e6f726c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -49,39 +49,19 @@ class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler) * An RDD for passing to DAGScheduler. These RDDs will use the dependencies and * preferredLocations (if any) that are passed to them. They are deliberately not executable * so we can test that DAGScheduler does not try to execute RDDs locally. - * - * Optionally, one can pass in a list of locations to use as preferred locations for each task, - * and a MapOutputTrackerMaster to enable reduce task locality. We pass the tracker separately - * because, in this test suite, it won't be the same as sc.env.mapOutputTracker. 
*/ class MyRDD( sc: SparkContext, numPartitions: Int, dependencies: List[Dependency[_]], - locations: Seq[Seq[String]] = Nil, - @transient tracker: MapOutputTrackerMaster = null) - extends RDD[(Int, Int)](sc, dependencies) with Serializable { - + locations: Seq[Seq[String]] = Nil) extends RDD[(Int, Int)](sc, dependencies) with Serializable { override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = throw new RuntimeException("should not be reached") - override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { override def index: Int = i }).toArray - - override def getPreferredLocations(partition: Partition): Seq[String] = { - if (locations.isDefinedAt(partition.index)) { - locations(partition.index) - } else if (tracker != null && dependencies.size == 1 && - dependencies(0).isInstanceOf[ShuffleDependency[_, _, _]]) { - // If we have only one shuffle dependency, use the same code path as ShuffledRDD for locality - val dep = dependencies(0).asInstanceOf[ShuffleDependency[_, _, _]] - tracker.getPreferredLocationsForShuffle(dep, partition.index) - } else { - Nil - } - } - + override def getPreferredLocations(split: Partition): Seq[String] = + if (locations.isDefinedAt(split.index)) locations(split.index) else Nil override def toString: String = "DAGSchedulerSuiteRDD " + id } @@ -371,8 +351,7 @@ class DAGSchedulerSuite */ test("getMissingParentStages should consider all ancestor RDDs' cache statuses") { val rddA = new MyRDD(sc, 1, Nil) - val rddB = new MyRDD(sc, 1, List(new ShuffleDependency(rddA, new HashPartitioner(1))), - tracker = mapOutputTracker) + val rddB = new MyRDD(sc, 1, List(new ShuffleDependency(rddA, null))) val rddC = new MyRDD(sc, 1, List(new OneToOneDependency(rddB))).cache() val rddD = new MyRDD(sc, 1, List(new OneToOneDependency(rddC))) cacheLocations(rddC.id -> 0) = @@ -479,9 +458,9 @@ class DAGSchedulerSuite test("run trivial shuffle") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) submit(reduceRdd, Array(0)) complete(taskSets(0), Seq( (Success, makeMapStatus("hostA", 1)), @@ -495,9 +474,9 @@ class DAGSchedulerSuite test("run trivial shuffle with fetch failure") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) complete(taskSets(0), Seq( (Success, makeMapStatus("hostA", reduceRdd.partitions.length)), @@ -611,8 +590,9 @@ class DAGSchedulerSuite val parts = 8 val shuffleMapRdd = new MyRDD(sc, parts, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(parts)) - val reduceRdd = new MyRDD(sc, parts, List(shuffleDep), tracker = mapOutputTracker) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) + val shuffleId = shuffleDep.shuffleId + val reduceRdd = new MyRDD(sc, parts, List(shuffleDep)) submit(reduceRdd, (0 until parts).toArray) completeShuffleMapStageSuccessfully(0, 0, numShufflePartitions = parts) @@ -645,8 +625,9 @@ class 
DAGSchedulerSuite setupStageAbortTest(sc) val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) + val shuffleId = shuffleDep.shuffleId + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) for (attempt <- 0 until Stage.MAX_CONSECUTIVE_FETCH_FAILURES) { @@ -687,10 +668,10 @@ class DAGSchedulerSuite setupStageAbortTest(sc) val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() - val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, new HashPartitioner(2)) - val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne), tracker = mapOutputTracker).cache() - val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, new HashPartitioner(1)) - val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo), tracker = mapOutputTracker) + val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) + val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() + val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) + val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) submit(finalRdd, Array(0)) // In the first two iterations, Stage 0 succeeds and stage 1 fails. In the next two iterations, @@ -736,10 +717,10 @@ class DAGSchedulerSuite setupStageAbortTest(sc) val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() - val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, new HashPartitioner(2)) - val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne), tracker = mapOutputTracker).cache() - val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, new HashPartitioner(1)) - val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo), tracker = mapOutputTracker) + val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) + val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() + val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) + val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) submit(finalRdd, Array(0)) // First, execute stages 0 and 1, failing stage 1 up to MAX-1 times. 
@@ -796,9 +777,9 @@ class DAGSchedulerSuite test("trivial shuffle with multiple fetch failures") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) complete(taskSets(0), Seq( (Success, makeMapStatus("hostA", reduceRdd.partitions.length)), @@ -837,9 +818,9 @@ class DAGSchedulerSuite */ test("late fetch failures don't cause multiple concurrent attempts for the same map stage") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) val mapStageId = 0 @@ -905,9 +886,9 @@ class DAGSchedulerSuite test("extremely late fetch failures don't cause multiple concurrent attempts for " + "the same stage") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) def countSubmittedReduceStageAttempts(): Int = { @@ -968,9 +949,9 @@ class DAGSchedulerSuite test("ignore late map task completions") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) // pretend we were told hostA went away @@ -1037,8 +1018,8 @@ class DAGSchedulerSuite test("run shuffle with map stage failure") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) // Fail the map stage. This should cause the entire job to fail. 
@@ -1240,12 +1221,12 @@ class DAGSchedulerSuite */ test("failure of stage used by two jobs") { val shuffleMapRdd1 = new MyRDD(sc, 2, Nil) - val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, new HashPartitioner(2)) + val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, null) val shuffleMapRdd2 = new MyRDD(sc, 2, Nil) - val shuffleDep2 = new ShuffleDependency(shuffleMapRdd2, new HashPartitioner(2)) + val shuffleDep2 = new ShuffleDependency(shuffleMapRdd2, null) - val reduceRdd1 = new MyRDD(sc, 2, List(shuffleDep1), tracker = mapOutputTracker) - val reduceRdd2 = new MyRDD(sc, 2, List(shuffleDep1, shuffleDep2), tracker = mapOutputTracker) + val reduceRdd1 = new MyRDD(sc, 2, List(shuffleDep1)) + val reduceRdd2 = new MyRDD(sc, 2, List(shuffleDep1, shuffleDep2)) // We need to make our own listeners for this test, since by default submit uses the same // listener for all jobs, and here we want to capture the failure for each job separately. @@ -1277,9 +1258,9 @@ class DAGSchedulerSuite test("run trivial shuffle with out-of-band failure and retry") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) submit(reduceRdd, Array(0)) // blockManagerMaster.removeExecutor("exec-hostA") // pretend we were told hostA went away @@ -1300,10 +1281,10 @@ class DAGSchedulerSuite test("recursive shuffle failures") { val shuffleOneRdd = new MyRDD(sc, 2, Nil) - val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, new HashPartitioner(2)) - val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne), tracker = mapOutputTracker) - val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, new HashPartitioner(1)) - val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo), tracker = mapOutputTracker) + val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) + val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)) + val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) + val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) submit(finalRdd, Array(0)) // have the first stage complete normally complete(taskSets(0), Seq( @@ -1329,10 +1310,10 @@ class DAGSchedulerSuite test("cached post-shuffle") { val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() - val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, new HashPartitioner(2)) - val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne), tracker = mapOutputTracker).cache() - val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, new HashPartitioner(1)) - val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo), tracker = mapOutputTracker) + val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) + val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() + val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) + val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) submit(finalRdd, Array(0)) cacheLocations(shuffleTwoRdd.id -> 0) = Seq(makeBlockManagerId("hostD")) cacheLocations(shuffleTwoRdd.id -> 1) = Seq(makeBlockManagerId("hostC")) @@ -1447,9 +1428,9 @@ class DAGSchedulerSuite test("reduce tasks should be placed locally with map output") { // Create an shuffleMapRdd with 1 partition val shuffleMapRdd = new MyRDD(sc, 1, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleDep = new 
ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) submit(reduceRdd, Array(0)) complete(taskSets(0), Seq( (Success, makeMapStatus("hostA", 1)))) @@ -1468,9 +1449,9 @@ class DAGSchedulerSuite val numMapTasks = 4 // Create an shuffleMapRdd with more partitions val shuffleMapRdd = new MyRDD(sc, numMapTasks, Nil) - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) submit(reduceRdd, Array(0)) val statuses = (1 to numMapTasks).map { i => @@ -1492,10 +1473,10 @@ class DAGSchedulerSuite // Create an RDD that has both a shuffle dependency and a narrow dependency (e.g. for a join) val rdd1 = new MyRDD(sc, 1, Nil) val rdd2 = new MyRDD(sc, 1, Nil, locations = Seq(Seq("hostB"))) - val shuffleDep = new ShuffleDependency(rdd1, new HashPartitioner(1)) + val shuffleDep = new ShuffleDependency(rdd1, null) val narrowDep = new OneToOneDependency(rdd2) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep, narrowDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep, narrowDep)) submit(reduceRdd, Array(0)) complete(taskSets(0), Seq( (Success, makeMapStatus("hostA", 1)))) @@ -1528,8 +1509,7 @@ class DAGSchedulerSuite test("simple map stage submission") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) - val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) // Submit a map stage by itself submitMapStage(shuffleDep) @@ -1555,8 +1535,7 @@ class DAGSchedulerSuite test("map stage submission with reduce stage also depending on the data") { val shuffleMapRdd = new MyRDD(sc, 2, Nil) val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) - val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 1, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) // Submit the map stage by itself submitMapStage(shuffleDep) @@ -1585,7 +1564,7 @@ class DAGSchedulerSuite val shuffleMapRdd = new MyRDD(sc, 2, Nil) val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) val shuffleId = shuffleDep.shuffleId - val reduceRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) // Submit a map stage by itself submitMapStage(shuffleDep) @@ -1634,9 +1613,9 @@ class DAGSchedulerSuite test("map stage submission with multiple shared stages and failures") { val rdd1 = new MyRDD(sc, 2, Nil) val dep1 = new ShuffleDependency(rdd1, new HashPartitioner(2)) - val rdd2 = new MyRDD(sc, 2, List(dep1), tracker = mapOutputTracker) + val rdd2 = new MyRDD(sc, 2, List(dep1)) val dep2 = new ShuffleDependency(rdd2, new HashPartitioner(2)) - val rdd3 = new MyRDD(sc, 2, List(dep2), tracker = mapOutputTracker) + val rdd3 = new MyRDD(sc, 2, List(dep2)) val listener1 = new SimpleListener val listener2 = new SimpleListener @@ -1742,7 +1721,7 @@ class DAGSchedulerSuite assertDataStructuresEmpty() // Also test that a reduce stage using this 
shuffled data can immediately run - val reduceRDD = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) + val reduceRDD = new MyRDD(sc, 2, List(shuffleDep)) results.clear() submit(reduceRDD, Array(0, 1)) complete(taskSets(2), Seq((Success, 42), (Success, 43))) diff --git a/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala index 26a372d6a905d..a5eafb1b5529e 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala @@ -114,7 +114,7 @@ class BlockStoreShuffleReaderSuite extends SparkFunSuite with LocalSparkContext // Make a mocked MapOutputTracker for the shuffle reader to use to determine what // shuffle data to read. val mapOutputTracker = mock(classOf[MapOutputTracker]) - when(mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceId, reduceId + 1)).thenReturn { + when(mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceId)).thenReturn { // Test a scenario where all data is local, to avoid creating a bunch of additional mocks // for the code to read data over the network. val shuffleBlockIdsAndSizes = (0 until numMaps).map { mapId => diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala index b92a302806f76..6f9754fa66a5a 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala @@ -79,6 +79,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte override def answer(invocation: InvocationOnMock): DiskBlockObjectWriter = { val args = invocation.getArguments new DiskBlockObjectWriter( + args(0).asInstanceOf[BlockId], args(1).asInstanceOf[File], args(2).asInstanceOf[SerializerInstance], args(3).asInstanceOf[Int], diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala index 7c19531c18802..66af6e1a79740 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala @@ -20,6 +20,7 @@ import java.io.File import org.scalatest.BeforeAndAfterEach +import org.apache.spark.SparkConf import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.serializer.JavaSerializer @@ -40,8 +41,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("verify write metrics") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) writer.write(Long.box(20), Long.box(30)) // Record metrics update on every write @@ -62,8 +63,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("verify write metrics on revert") { val file = new File(tempDir, "somefile") val writeMetrics = new 
ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) writer.write(Long.box(20), Long.box(30)) // Record metrics update on every write @@ -85,8 +86,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("Reopening a closed block writer") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) writer.open() writer.close() @@ -98,8 +99,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("calling revertPartialWritesAndClose() on a closed block writer should have no effect") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) for (i <- 1 to 1000) { writer.write(i, i) } @@ -114,8 +115,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("commitAndClose() should be idempotent") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) for (i <- 1 to 1000) { writer.write(i, i) } @@ -132,8 +133,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("revertPartialWritesAndClose() should be idempotent") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) for (i <- 1 to 1000) { writer.write(i, i) } @@ -150,8 +151,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("fileSegment() can only be called after commitAndClose() has been called") { val file = new File(tempDir, "somefile") val writeMetrics = new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) for (i <- 1 to 1000) { writer.write(i, i) } @@ -164,8 +165,8 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { test("commitAndClose() without ever opening or writing") { val file = new File(tempDir, "somefile") val writeMetrics 
= new ShuffleWriteMetrics() - val writer = new DiskBlockObjectWriter( - file, new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics) writer.commitAndClose() assert(writer.fileSegment().length === 0) } diff --git a/docs/configuration.md b/docs/configuration.md index c276e8e90decf..3d3153832e6e0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1542,14 +1542,6 @@ Apart from these, the following properties are also available, and may be useful higher memory usage in Spark. - - spark.streaming.stopGracefullyOnShutdown - false - - If true, Spark shuts down the StreamingContext gracefully on JVM - shutdown rather than immediately. - - spark.streaming.kafka.maxRatePerPartition not set diff --git a/docs/ml-features.md b/docs/ml-features.md index 142afac2f3f95..f0c510fff1e68 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -28,6 +28,7 @@ The algorithm combines Term Frequency (TF) counts with the [hashing trick](http: **IDF**: `IDF` is an `Estimator` which fits on a dataset and produces an `IDFModel`. The `IDFModel` takes feature vectors (generally created from `HashingTF`) and scales each column. Intuitively, it down-weights columns which appear frequently in a corpus. Please refer to the [MLlib user guide on TF-IDF](mllib-feature-extraction.html#tf-idf) for more details on Term Frequency and Inverse Document Frequency. +For API details, refer to the [HashingTF API docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and the [IDF API docs](api/scala/index.html#org.apache.spark.ml.feature.IDF). In the following code segment, we start with a set of sentences. We split each sentence into words using `Tokenizer`. For each sentence (bag of words), we use `HashingTF` to hash the sentence into a feature vector. We use `IDF` to rescale the feature vectors; this generally improves performance when using text as features. Our feature vectors could then be passed to a learning algorithm. @@ -165,11 +166,6 @@ for more details on the API.
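As a rough illustration of the `Tokenizer` / `HashingTF` / `IDF` flow described above, here is a minimal Scala sketch; it assumes a spark-shell style `sqlContext`, and the sentences, column names, and `setNumFeatures(20)` value are purely illustrative.

{% highlight scala %}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

val sentenceData = sqlContext.createDataFrame(Seq(
  (0, "Hi I heard about Spark"),
  (0, "I wish Java could use case classes"),
  (1, "Logistic regression models are neat")
)).toDF("label", "sentence")

// Split sentences into words, then hash each bag of words into a fixed-size vector.
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)
val hashingTF = new HashingTF()
  .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)

// IDF is an Estimator: fit produces an IDFModel that down-weights frequent terms.
val idfModel = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(featurizedData)
idfModel.transform(featurizedData).select("features", "label").show()
{% endhighlight %}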
- -Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) -and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} @@ -192,11 +188,6 @@ regexTokenized.select("words", "label").take(3).foreach(println)
- -Refer to the [Tokenizer Java docs](api/java/org/apache/spark/ml/feature/Tokenizer.html) -and the [RegexTokenizer Java docs](api/java/org/apache/spark/ml/feature/RegexTokenizer.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -238,11 +229,6 @@ RegexTokenizer regexTokenizer = new RegexTokenizer()
- -Refer to the [Tokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Tokenizer) and -the the [RegexTokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import Tokenizer, RegexTokenizer @@ -272,8 +258,7 @@ words from the input sequences. The list of stopwords is specified by the `stopWords` parameter. We provide [a list of stop words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by default, accessible by calling `getStopWords` on a newly instantiated -`StopWordsRemover` instance. A boolean parameter `caseSensitive` indicates -if the matches should be case sensitive (false by default). +`StopWordsRemover` instance. **Examples** @@ -303,8 +288,10 @@ filtered out.
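A minimal Scala sketch of `StopWordsRemover` with the parameters just described; it assumes a spark-shell style `sqlContext`, the data and column names are illustrative, and `setCaseSensitive(false)` simply restates the default.

{% highlight scala %}
import org.apache.spark.ml.feature.StopWordsRemover

val remover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
  .setCaseSensitive(false)   // default; set to true to keep "The" while still dropping "the"

val dataSet = sqlContext.createDataFrame(Seq(
  (0, Seq("I", "saw", "the", "red", "baloon")),
  (1, Seq("Mary", "had", "a", "little", "lamb"))
)).toDF("id", "raw")

// Stop words such as "I", "the" and "a" are dropped from the "filtered" column.
remover.transform(dataSet).show()
{% endhighlight %}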
-Refer to the [StopWordsRemover Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover) -for more details on the API. +[`StopWordsRemover`](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). {% highlight scala %} import org.apache.spark.ml.feature.StopWordsRemover @@ -323,8 +310,10 @@ remover.transform(dataSet).show()
-Refer to the [StopWordsRemover Java docs](api/java/org/apache/spark/ml/feature/StopWordsRemover.html) -for more details on the API. +[`StopWordsRemover`](api/java/org/apache/spark/ml/feature/StopWordsRemover.html) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). {% highlight java %} import java.util.Arrays; @@ -357,9 +346,10 @@ remover.transform(dataset).show();
- -Refer to the [StopWordsRemover Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover) -for more details on the API. +[`StopWordsRemover`](api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). {% highlight python %} from pyspark.ml.feature import StopWordsRemover @@ -385,8 +375,7 @@ An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (t
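A minimal Scala sketch of `NGram`; it assumes a spark-shell style `sqlContext`, the input column must already hold sequences of strings (for example the output of `Tokenizer`), and `setN(2)` restates the default bigram setting.

{% highlight scala %}
import org.apache.spark.ml.feature.NGram

val wordDataFrame = sqlContext.createDataFrame(Seq(
  (0, Array("Hi", "I", "heard", "about", "Spark")),
  (1, Array("I", "wish", "Java", "could", "use", "case", "classes"))
)).toDF("label", "words")

// n = 2 produces bigrams such as "Hi I", "I heard", "heard about", ...
val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram.transform(wordDataFrame).select("ngrams").show()
{% endhighlight %}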
-Refer to the [NGram Scala docs](api/scala/index.html#org.apache.spark.ml.feature.NGram) -for more details on the API. +[`NGram`](api/scala/index.html#org.apache.spark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). {% highlight scala %} import org.apache.spark.ml.feature.NGram @@ -405,8 +394,7 @@ ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(pri
-Refer to the [NGram Java docs](api/java/org/apache/spark/ml/feature/NGram.html) -for more details on the API. +[`NGram`](api/java/org/apache/spark/ml/feature/NGram.html) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). {% highlight java %} import java.util.Arrays; @@ -444,8 +432,7 @@ for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
-Refer to the [NGram Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.NGram) -for more details on the API. +[`NGram`](api/python/pyspark.ml.html#pyspark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). {% highlight python %} from pyspark.ml.feature import NGram @@ -473,8 +460,7 @@ Binarization is the process of thresholding numerical features to binary (0/1) f
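A minimal Scala sketch of the thresholding described above; it assumes a spark-shell style `sqlContext`, and the threshold of 0.5 and the data are illustrative.

{% highlight scala %}
import org.apache.spark.ml.feature.Binarizer

val dataFrame = sqlContext.createDataFrame(
  Seq((0, 0.1), (1, 0.8), (2, 0.2))
).toDF("label", "feature")

val binarizer = new Binarizer()
  .setInputCol("feature")
  .setOutputCol("binarized_feature")
  .setThreshold(0.5)   // values greater than 0.5 become 1.0, the rest become 0.0

binarizer.transform(dataFrame).select("label", "binarized_feature").show()
{% endhighlight %}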
-Refer to the [Binarizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Binarizer) -for more details on the API. +Refer to the [Binarizer API doc](api/scala/index.html#org.apache.spark.ml.feature.Binarizer) for more details. {% highlight scala %} import org.apache.spark.ml.feature.Binarizer @@ -500,8 +486,7 @@ binarizedFeatures.collect().foreach(println)
-Refer to the [Binarizer Java docs](api/java/org/apache/spark/ml/feature/Binarizer.html) -for more details on the API. +Refer to the [Binarizer API doc](api/java/org/apache/spark/ml/feature/Binarizer.html) for more details. {% highlight java %} import java.util.Arrays; @@ -541,8 +526,7 @@ for (Row r : binarizedFeatures.collect()) {
-Refer to the [Binarizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer) -for more details on the API. +Refer to the [Binarizer API doc](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer) for more details. {% highlight python %} from pyspark.ml.feature import Binarizer @@ -567,10 +551,7 @@ for binarized_feature, in binarizedFeatures.collect():
- -Refer to the [PCA Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PCA) -for more details on the API. - +See the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API details. {% highlight scala %} import org.apache.spark.ml.feature.PCA import org.apache.spark.mllib.linalg.Vectors @@ -593,10 +574,7 @@ result.show()
- -Refer to the [PCA Java docs](api/java/org/apache/spark/ml/feature/PCA.html) -for more details on the API. - +See the [Java API documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details. {% highlight java %} import java.util.Arrays; @@ -636,10 +614,7 @@ result.show();
- -Refer to the [PCA Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) -for more details on the API. - +See the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API details. {% highlight python %} from pyspark.ml.feature import PCA from pyspark.mllib.linalg import Vectors @@ -662,10 +637,6 @@ result.show(truncate=False)
- -Refer to the [PolynomialExpansion Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.mllib.linalg.Vectors @@ -686,10 +657,6 @@ polyDF.select("polyFeatures").take(3).foreach(println)
- -Refer to the [PolynomialExpansion Java docs](api/java/org/apache/spark/ml/feature/PolynomialExpansion.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -730,10 +697,6 @@ for (Row r : row) {
- -Refer to the [PolynomialExpansion Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import PolynomialExpansion from pyspark.mllib.linalg import Vectors @@ -767,10 +730,6 @@ $0$th DCT coefficient and _not_ the $N/2$th).
- -Refer to the [DCT Scala docs](api/scala/index.html#org.apache.spark.ml.feature.DCT) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.DCT import org.apache.spark.mllib.linalg.Vectors @@ -790,10 +749,6 @@ dctDf.select("featuresDCT").show(3)
- -Refer to the [DCT Java docs](api/java/org/apache/spark/ml/feature/DCT.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -878,8 +833,8 @@ index `2`.
-Refer to the [StringIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) -for more details on the API. +[`StringIndexer`](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) takes an input +column name and an output column name. {% highlight scala %} import org.apache.spark.ml.feature.StringIndexer @@ -896,9 +851,8 @@ indexed.show()
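For reference, a compact Scala sketch of the frequency-based index assignment described above, assuming a spark-shell style `sqlContext` and illustrative data.

{% highlight scala %}
import org.apache.spark.ml.feature.StringIndexer

val df = sqlContext.createDataFrame(
  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
).toDF("id", "category")

// Indices are assigned by label frequency: "a" (most frequent) -> 0.0, "c" -> 1.0, "b" -> 2.0.
val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
indexer.fit(df).transform(df).show()
{% endhighlight %}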
- -Refer to the [StringIndexer Java docs](api/java/org/apache/spark/ml/feature/StringIndexer.html) -for more details on the API. +[`StringIndexer`](api/java/org/apache/spark/ml/feature/StringIndexer.html) takes an input column +name and an output column name. {% highlight java %} import java.util.Arrays; @@ -935,8 +889,8 @@ indexed.show();
-Refer to the [StringIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer) -for more details on the API. +[`StringIndexer`](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer) takes an input +column name and an output column name. {% highlight python %} from pyspark.ml.feature import StringIndexer @@ -957,10 +911,6 @@ indexed.show()
- -Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} @@ -987,10 +937,6 @@ encoded.select("id", "categoryVec").foreach(println)
- -Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -1033,10 +979,6 @@ DataFrame encoded = encoder.transform(indexed);
- -Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import OneHotEncoder, StringIndexer @@ -1070,14 +1012,12 @@ It can both automatically decide which features are categorical and convert orig Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance. +Please refer to the [VectorIndexer API docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer) for more details. + In the example below, we read in a dataset of labeled points and then use `VectorIndexer` to decide which features should be treated as categorical. We transform the categorical feature values to their indices. This transformed data could then be passed to algorithms such as `DecisionTreeRegressor` that handle categorical features.
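A minimal Scala sketch of that flow; it assumes a spark-shell style `sc` and `sqlContext` and the sample libsvm file that ships with Spark, and `setMaxCategories(10)` is an illustrative threshold for deciding which features are categorical.

{% highlight scala %}
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.util.MLUtils
import sqlContext.implicits._

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

val indexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexed")
  .setMaxCategories(10)   // features with at most 10 distinct values are treated as categorical

val indexerModel = indexer.fit(data)
println(s"Chose ${indexerModel.categoryMaps.size} categorical features")

// Categorical feature values are replaced by indices; continuous features are left unchanged.
indexerModel.transform(data).show()
{% endhighlight %}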
- -Refer to the [VectorIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.VectorIndexer @@ -1098,10 +1038,6 @@ val indexedData = indexerModel.transform(data)
- -Refer to the [VectorIndexer Java docs](api/java/org/apache/spark/ml/feature/VectorIndexer.html) -for more details on the API. - {% highlight java %} import java.util.Map; @@ -1129,10 +1065,6 @@ DataFrame indexedData = indexerModel.transform(data);
- -Refer to the [VectorIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorIndexer) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import VectorIndexer @@ -1156,10 +1088,6 @@ The following example demonstrates how to load a dataset in libsvm format and th
- -Refer to the [Normalizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Normalizer) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.Normalizer @@ -1179,10 +1107,6 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
- -Refer to the [Normalizer Java docs](api/java/org/apache/spark/ml/feature/Normalizer.html) -for more details on the API. - {% highlight java %} import org.apache.spark.ml.feature.Normalizer; import org.apache.spark.sql.DataFrame; @@ -1204,10 +1128,6 @@ DataFrame lInfNormData =
- -Refer to the [Normalizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import Normalizer @@ -1236,14 +1156,14 @@ lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")}) Note that if the standard deviation of a feature is zero, it will return default `0.0` value in the `Vector` for that feature. +More details can be found in the API docs for +[StandardScaler](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler) and +[StandardScalerModel](api/scala/index.html#org.apache.spark.ml.feature.StandardScalerModel). + The following example demonstrates how to load a dataset in libsvm format and then normalize each feature to have unit standard deviation.
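A minimal Scala sketch of that example; it assumes a spark-shell style `sc` and `sqlContext` and the sample libsvm file shipped with Spark, and `setWithMean(false)` keeps sparse input sparse by skipping centering.

{% highlight scala %}
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.mllib.util.MLUtils
import sqlContext.implicits._

val dataFrame = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithStd(true)    // scale each feature to unit standard deviation
  .setWithMean(false)  // do not center, so sparse vectors stay sparse

// StandardScaler is an Estimator: fit computes the per-feature statistics used for scaling.
val scalerModel = scaler.fit(dataFrame)
scalerModel.transform(dataFrame).show()
{% endhighlight %}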
- -Refer to the [StandardScaler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.StandardScaler @@ -1264,10 +1184,6 @@ val scaledData = scalerModel.transform(dataFrame)
- -Refer to the [StandardScaler Java docs](api/java/org/apache/spark/ml/feature/StandardScaler.html) -for more details on the API. - {% highlight java %} import org.apache.spark.ml.feature.StandardScaler; import org.apache.spark.ml.feature.StandardScalerModel; @@ -1290,10 +1206,6 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
- -Refer to the [StandardScaler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StandardScaler) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import StandardScaler @@ -1332,11 +1244,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
- -Refer to the [MinMaxScaler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) -and the [MinMaxScalerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel) -for more details on the API. - +More details can be found in the API docs for +[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and +[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). {% highlight scala %} import org.apache.spark.ml.feature.MinMaxScaler @@ -1355,11 +1265,9 @@ val scaledData = scalerModel.transform(dataFrame)
- -Refer to the [MinMaxScaler Java docs](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) -and the [MinMaxScalerModel Java docs](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html) -for more details on the API. - +More details can be found in the API docs for +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.MinMaxScaler; @@ -1397,10 +1305,6 @@ The following example demonstrates how to bucketize a column of `Double`s into a
- -Refer to the [Bucketizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Bucketizer) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.Bucketizer import org.apache.spark.sql.DataFrame @@ -1421,10 +1325,6 @@ val bucketedData = bucketizer.transform(dataFrame)
- -Refer to the [Bucketizer Java docs](api/java/org/apache/spark/ml/feature/Bucketizer.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -1460,10 +1360,6 @@ DataFrame bucketedData = bucketizer.transform(dataFrame);
- -Refer to the [Bucketizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import Bucketizer @@ -1500,14 +1396,14 @@ v_N \end{pmatrix} \]` +[`ElementwiseProduct`](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct) takes the following parameter: + +* `scalingVec`: the transforming vector. + This example below demonstrates how to transform vectors using a transforming vector value.
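A minimal Scala sketch using the `scalingVec` parameter described above; it assumes a spark-shell style `sqlContext`, and the input vectors and scaling vector are illustrative.

{% highlight scala %}
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

val dataFrame = sqlContext.createDataFrame(Seq(
  ("a", Vectors.dense(1.0, 2.0, 3.0)),
  ("b", Vectors.dense(4.0, 5.0, 6.0))
)).toDF("id", "vector")

val transformer = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))   // each input vector is multiplied element-wise by this
  .setInputCol("vector")
  .setOutputCol("transformedVector")

// (1,2,3) -> (0,2,6) and (4,5,6) -> (0,5,12)
transformer.transform(dataFrame).show()
{% endhighlight %}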
- -Refer to the [ElementwiseProduct Scala docs](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct) -for more details on the API. - {% highlight scala %} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors @@ -1530,10 +1426,6 @@ transformer.transform(dataFrame).show()
- -Refer to the [ElementwiseProduct Java docs](api/java/org/apache/spark/ml/feature/ElementwiseProduct.html) -for more details on the API. - {% highlight java %} import java.util.Arrays; @@ -1572,10 +1464,6 @@ transformer.transform(dataFrame).show();
- -Refer to the [ElementwiseProduct Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct) -for more details on the API. - {% highlight python %} from pyspark.ml.feature import ElementwiseProduct from pyspark.mllib.linalg import Vectors @@ -1629,8 +1517,8 @@ output column to `features`, after transformation we should get the following Da
-Refer to the [VectorAssembler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler) -for more details on the API. +[`VectorAssembler`](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler) takes an array +of input column names and an output column name. {% highlight scala %} import org.apache.spark.mllib.linalg.Vectors @@ -1649,8 +1537,8 @@ println(output.select("features", "clicked").first())
-Refer to the [VectorAssembler Java docs](api/java/org/apache/spark/ml/feature/VectorAssembler.html) -for more details on the API. +[`VectorAssembler`](api/java/org/apache/spark/ml/feature/VectorAssembler.html) takes an array +of input column names and an output column name. {% highlight java %} import java.util.Arrays; @@ -1686,8 +1574,8 @@ System.out.println(output.select("features", "clicked").first());
-Refer to the [VectorAssembler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) -for more details on the API. +[`VectorAssembler`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) takes a list +of input column names and an output column name. {% highlight python %} from pyspark.mllib.linalg import Vectors @@ -1763,8 +1651,8 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i
-Refer to the [VectorSlicer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) -for more details on the API. +[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input +column name with specified indices or names and an output column name. {% highlight scala %} import org.apache.spark.mllib.linalg.Vectors @@ -1797,8 +1685,8 @@ println(output.select("userFeatures", "features").first())
-Refer to the [VectorSlicer Java docs](api/java/org/apache/spark/ml/feature/VectorSlicer.html) -for more details on the API. +[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name +with specified indices or names and an output column name. {% highlight java %} import java.util.Arrays; @@ -1868,8 +1756,7 @@ id | country | hour | clicked | features | label
-Refer to the [RFormula Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RFormula) -for more details on the API. +[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. {% highlight scala %} import org.apache.spark.ml.feature.RFormula @@ -1890,8 +1777,7 @@ output.select("features", "label").show()
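To make the optional output-column parameters concrete, a minimal Scala sketch assuming a spark-shell style `sqlContext`; the formula and data mirror the table above.

{% highlight scala %}
import org.apache.spark.ml.feature.RFormula

val dataset = sqlContext.createDataFrame(Seq(
  (7, "US", 18, 1.0),
  (8, "CA", 12, 0.0),
  (9, "NZ", 15, 0.0)
)).toDF("id", "country", "hour", "clicked")

val formula = new RFormula()
  .setFormula("clicked ~ country + hour")   // R-style: label ~ feature terms
  .setFeaturesCol("features")               // optional names for the output columns
  .setLabelCol("label")

// RFormula is an Estimator: fit encodes string terms, then transform produces features and label.
formula.fit(dataset).transform(dataset).select("features", "label").show()
{% endhighlight %}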
-Refer to the [RFormula Java docs](api/java/org/apache/spark/ml/feature/RFormula.html) -for more details on the API. +[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an R formula string, and optional parameters for the names of its output columns. {% highlight java %} import java.util.Arrays; @@ -1929,8 +1815,7 @@ output.select("features", "label").show();
-Refer to the [RFormula Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) -for more details on the API. +[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. {% highlight python %} from pyspark.ml.feature import RFormula diff --git a/docs/programming-guide.md b/docs/programming-guide.md index f823b89a4b5e9..9bb0240e641c5 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -802,9 +802,9 @@ print("Counter value: " + counter) The primary challenge is that the behavior of the above code is undefined. In local mode with a single JVM, the above code will sum the values within the RDD and store it in **counter**. This is because both the RDD and the variable **counter** are in the same memory space on the driver node. -However, in `cluster` mode, what happens is more complicated, and the above may not work as intended. To execute jobs, Spark breaks up the processing of RDD operations into tasks - each of which is operated on by an executor. Prior to execution, Spark computes the **closure**. The closure is those variables and methods which must be visible for the executor to perform its computations on the RDD (in this case `foreach()`). This closure is serialized and sent to each executor. In `local` mode, there is only the one executors so everything shares the same closure. In other modes however, this is not the case and the executors running on separate worker nodes each have their own copy of the closure. +However, in `cluster` mode, what happens is more complicated, and the above may not work as intended. To execute jobs, Spark breaks up the processing of RDD operations into tasks - each of which is operated on by an executor. Prior to execution, Spark computes the **closure**. The closure is those variables and methods which must be visible for the executor to perform its computations on the RDD (in this case `foreach()`). This closure is serialized and sent to each executor. In `local` mode, there is only the one executors so everything shares the same closure. In other modes however, this is not the case and the executors running on seperate worker nodes each have their own copy of the closure. -What is happening here is that the variables within the closure sent to each executor are now copies and thus, when **counter** is referenced within the `foreach` function, it's no longer the **counter** on the driver node. There is still a **counter** in the memory of the driver node but this is no longer visible to the executors! The executors only see the copy from the serialized closure. Thus, the final value of **counter** will still be zero since all operations on **counter** were referencing the value within the serialized closure. +What is happening here is that the variables within the closure sent to each executor are now copies and thus, when **counter** is referenced within the `foreach` function, it's no longer the **counter** on the driver node. There is still a **counter** in the memory of the driver node but this is no longer visible to the executors! The executors only sees the copy from the serialized closure. Thus, the final value of **counter** will still be zero since all operations on **counter** were referencing the value within the serialized closure. To ensure well-defined behavior in these sorts of scenarios one should use an [`Accumulator`](#AccumLink). 
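A minimal sketch of the pitfall and of the accumulator alternative, assuming a spark-shell style `sc` and the pre-2.0 `sc.accumulator` API; the numbers are illustrative.

{% highlight scala %}
var counter = 0
val rdd = sc.parallelize(1 to 100)

// Each executor mutates its own deserialized copy of `counter`, not the driver's variable.
rdd.foreach(x => counter += x)
println("Counter value: " + counter)   // may still print 0 when running on a cluster

// An accumulator is updated safely across tasks and read back on the driver.
val accum = sc.accumulator(0)
rdd.foreach(x => accum += x)
println("Accumulator value: " + accum.value)   // 5050
{% endhighlight %}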
Accumulators in Spark are used specifically to provide a mechanism for safely updating a variable when execution is split up across worker nodes in a cluster. The Accumulators section of this guide discusses these in more detail. diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index db6bfa69ee0fe..3d0ad98fe68d3 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -16,19 +16,18 @@ containers used by the application use the same configuration. If the configurat Java system properties or environment variables not managed by YARN, they should also be set in the Spark application's configuration (driver, executors, and the AM when running in client mode). -There are two deploy modes that can be used to launch Spark applications on YARN. In `cluster` mode, the Spark driver runs inside an application master process which is managed by YARN on the cluster, and the client can go away after initiating the application. In `client` mode, the driver runs in the client process, and the application master is only used for requesting resources from YARN. +There are two deploy modes that can be used to launch Spark applications on YARN. In `yarn-cluster` mode, the Spark driver runs inside an application master process which is managed by YARN on the cluster, and the client can go away after initiating the application. In `yarn-client` mode, the driver runs in the client process, and the application master is only used for requesting resources from YARN. -Unlike [Spark standalone](spark-standalone.html) and [Mesos](running-on-mesos.html) modes, in which the master's address is specified in the `--master` parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. Thus, the `--master` parameter is `yarn`. +Unlike [Spark standalone](spark-standalone.html) and [Mesos](running-on-mesos.html) modes, in which the master's address is specified in the `--master` parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. Thus, the `--master` parameter is `yarn-client` or `yarn-cluster`. -To launch a Spark application in `cluster` mode: +To launch a Spark application in `yarn-cluster` mode: - $ ./bin/spark-submit --class path.to.your.Class --master yarn --deploy-mode cluster [options] [app options] + $ ./bin/spark-submit --class path.to.your.Class --master yarn-cluster [options] [app options] For example: $ ./bin/spark-submit --class org.apache.spark.examples.SparkPi \ - --master yarn \ - --deploy-mode cluster \ + --master yarn-cluster \ --driver-memory 4g \ --executor-memory 2g \ --executor-cores 1 \ @@ -38,17 +37,16 @@ For example: The above starts a YARN client program which starts the default Application Master. Then SparkPi will be run as a child thread of Application Master. The client will periodically poll the Application Master for status updates and display them in the console. The client will exit once your application has finished running. Refer to the "Debugging your Application" section below for how to see driver and executor logs. -To launch a Spark application in `client` mode, do the same, but replace `cluster` with `client`. The following shows how you can run `spark-shell` in `client` mode: +To launch a Spark application in `yarn-client` mode, do the same, but replace `yarn-cluster` with `yarn-client`. 
The following shows how you can run `spark-shell` in `yarn-client` mode: - $ ./bin/spark-shell --master yarn --deploy-mode client + $ ./bin/spark-shell --master yarn-client ## Adding Other JARs -In `cluster` mode, the driver runs on a different machine than the client, so `SparkContext.addJar` won't work out of the box with files that are local to the client. To make files on the client available to `SparkContext.addJar`, include them with the `--jars` option in the launch command. +In `yarn-cluster` mode, the driver runs on a different machine than the client, so `SparkContext.addJar` won't work out of the box with files that are local to the client. To make files on the client available to `SparkContext.addJar`, include them with the `--jars` option in the launch command. $ ./bin/spark-submit --class my.main.Class \ - --master yarn \ - --deploy-mode cluster \ + --master yarn-cluster \ --jars my-other-jar.jar,my-other-other-jar.jar my-main-jar.jar app_arg1 app_arg2 @@ -134,8 +132,8 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.am.waitTime 100s - In cluster mode, time for the YARN Application Master to wait for the - SparkContext to be initialized. In client mode, time for the YARN Application Master to wait + In yarn-cluster mode, time for the YARN Application Master to wait for the + SparkContext to be initialized. In yarn-client mode, time for the YARN Application Master to wait for the driver to connect to it. @@ -273,8 +271,8 @@ If you need a reference to the proper location to put log files in the YARN so t Add the environment variable specified by EnvironmentVariableName to the Application Master process launched on YARN. The user can specify multiple of - these and to set multiple environment variables. In cluster mode this controls - the environment of the Spark driver and in client mode it only controls + these and to set multiple environment variables. In yarn-cluster mode this controls + the environment of the Spark driver and in yarn-client mode it only controls the environment of the executor launcher. @@ -402,6 +400,6 @@ If you need a reference to the proper location to put log files in the YARN so t # Important notes - Whether core requests are honored in scheduling decisions depends on which scheduler is in use and how it is configured. -- In `cluster` mode, the local directories used by the Spark executors and the Spark driver will be the local directories configured for YARN (Hadoop YARN config `yarn.nodemanager.local-dirs`). If the user specifies `spark.local.dir`, it will be ignored. In `client` mode, the Spark executors will use the local directories configured for YARN while the Spark driver will use those defined in `spark.local.dir`. This is because the Spark driver does not run on the YARN cluster in `client` mode, only the Spark executors do. +- In `yarn-cluster` mode, the local directories used by the Spark executors and the Spark driver will be the local directories configured for YARN (Hadoop YARN config `yarn.nodemanager.local-dirs`). If the user specifies `spark.local.dir`, it will be ignored. In `yarn-client` mode, the Spark executors will use the local directories configured for YARN while the Spark driver will use those defined in `spark.local.dir`. This is because the Spark driver does not run on the YARN cluster in `yarn-client` mode, only the Spark executors do. - The `--files` and `--archives` options support specifying file names with the # similar to Hadoop. 
For example you can specify: `--files localtest.txt#appSees.txt` and this will upload the file you have locally named `localtest.txt` into HDFS but this will be linked to by the name `appSees.txt`, and your application should use the name as `appSees.txt` to reference it when running on YARN. -- The `--jars` option allows the `SparkContext.addJar` function to work if you are using it with local files and running in `cluster` mode. It does not need to be used if you are using it with HDFS, HTTP, HTTPS, or FTP files. +- The `--jars` option allows the `SparkContext.addJar` function to work if you are using it with local files and running in `yarn-cluster` mode. It does not need to be used if you are using it with HDFS, HTTP, HTTPS, or FTP files. diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 2fe5c36338899..5d5e797516758 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1623,7 +1623,7 @@ on all of the worker nodes, as they will need access to the Hive serialization a (SerDes) in order to access data stored in Hive. Configuration of Hive is done by placing your `hive-site.xml` file in `conf/`. Please note when running -the query on a YARN cluster (`cluster` mode), the `datanucleus` jars under the `lib_managed/jars` directory +the query on a YARN cluster (`yarn-cluster` mode), the `datanucleus` jars under the `lib_managed/jars` directory and `hive-site.xml` under `conf/` directory need to be available on the driver and all executors launched by the YARN cluster. The convenient way to do this is adding them through the `--jars` option and `--file` option of the `spark-submit` command. diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index ac2a14eb56fea..915be0f479157 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -103,8 +103,7 @@ run it with `--help`. Here are a few examples of common options: export HADOOP_CONF_DIR=XXX ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ - --master yarn \ - --deploy-mode cluster \ # can be client for client mode + --master yarn-cluster \ # can also be yarn-client for client mode --executor-memory 20G \ --num-executors 50 \ /path/to/examples.jar \ @@ -123,25 +122,21 @@ The master URL passed to Spark can be in one of the following formats: - - - - + + + - - - -
Master URLMeaning
local Run Spark locally with one worker thread (i.e. no parallelism at all).
local[K] Run Spark locally with K worker threads (ideally, set this to the number of cores on your machine).
local[*] Run Spark locally with as many worker threads as logical cores on your machine.
spark://HOST:PORT Connect to the given Spark standalone +
local Run Spark locally with one worker thread (i.e. no parallelism at all).
local[K] Run Spark locally with K worker threads (ideally, set this to the number of cores on your machine).
local[*] Run Spark locally with as many worker threads as logical cores on your machine.
spark://HOST:PORT Connect to the given Spark standalone cluster master. The port must be whichever one your master is configured to use, which is 7077 by default.
mesos://HOST:PORT Connect to the given Mesos cluster. +
mesos://HOST:PORT Connect to the given Mesos cluster. The port must be whichever one your Mesos master is configured to use, which is 5050 by default. Or, for a Mesos cluster using ZooKeeper, use mesos://zk://....
yarn Connect to a YARN cluster in - client or cluster mode depending on the value of --deploy-mode. - The cluster location will be found based on the HADOOP_CONF_DIR or YARN_CONF_DIR variable. +
yarn-client Connect to a YARN cluster in +client mode. The cluster location will be found based on the HADOOP_CONF_DIR or YARN_CONF_DIR variable.
yarn-client Equivalent to yarn with --deploy-mode client, - which is preferred to `yarn-client` -
yarn-cluster Equivalent to yarn with --deploy-mode cluster, - which is preferred to `yarn-cluster` +
yarn-cluster Connect to a YARN cluster in +cluster mode. The cluster location will be found based on the HADOOP_CONF_DIR or YARN_CONF_DIR variable.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 0b4c0d9ba9f8b..a377694507d29 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -219,11 +219,6 @@ public Vector predictRaw(Vector features) { */ public int numClasses() { return 2; } - /** - * Number of features the model was trained on. - */ - public int numFeatures() { return weights_.size(); } - /** * Create a copy of the model. * The copy is shallow, except for the embedded paramMap, which gets a deep copy. diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 3758edc56198a..340c3559b15ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -172,9 +172,6 @@ private class MyLogisticRegressionModel( /** Number of classes the label can take. 2 indicates binary classification. */ override val numClasses: Int = 2 - /** Number of features the model was trained on. */ - override val numFeatures: Int = weights.size - /** * Create a copy of the model. * The copy is shallow, except for the embedded paramMap, which gets a deep copy. diff --git a/licenses/LICENSE-AnchorJS.txt b/licenses/LICENSE-AnchorJS.txt deleted file mode 100644 index 2bf24b9b9f848..0000000000000 --- a/licenses/LICENSE-AnchorJS.txt +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-DPark.txt b/licenses/LICENSE-DPark.txt deleted file mode 100644 index 1d916090e4ea0..0000000000000 --- a/licenses/LICENSE-DPark.txt +++ /dev/null @@ -1,30 +0,0 @@ -Copyright (c) 2011, Douban Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - - * Neither the name of the Douban Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-Mockito.txt b/licenses/LICENSE-Mockito.txt deleted file mode 100644 index e0840a446caf5..0000000000000 --- a/licenses/LICENSE-Mockito.txt +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) 2007 Mockito contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-SnapTree.txt b/licenses/LICENSE-SnapTree.txt deleted file mode 100644 index a538825d89ec5..0000000000000 --- a/licenses/LICENSE-SnapTree.txt +++ /dev/null @@ -1,35 +0,0 @@ -SNAPTREE LICENSE - -Copyright (c) 2009-2012 Stanford University, unless otherwise specified. -All rights reserved. - -This software was developed by the Pervasive Parallelism Laboratory of -Stanford University, California, USA. - -Permission to use, copy, modify, and distribute this software in source -or binary form for any purpose with or without fee is hereby granted, -provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. 
Neither the name of Stanford University nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. diff --git a/licenses/LICENSE-antlr.txt b/licenses/LICENSE-antlr.txt deleted file mode 100644 index 3021ea04332ed..0000000000000 --- a/licenses/LICENSE-antlr.txt +++ /dev/null @@ -1,8 +0,0 @@ -[The BSD License] -Copyright (c) 2012 Terence Parr and Sam Harwell -All rights reserved. -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-boto.txt b/licenses/LICENSE-boto.txt deleted file mode 100644 index 7bba0cd9e10a4..0000000000000 --- a/licenses/LICENSE-boto.txt +++ /dev/null @@ -1,20 +0,0 @@ -Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, dis- -tribute, sublicense, and/or sell copies of the Software, and to permit -persons to whom the Software is furnished to do so, subject to the fol- -lowing conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- -ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-cloudpickle.txt b/licenses/LICENSE-cloudpickle.txt deleted file mode 100644 index b1e20fa1eda88..0000000000000 --- a/licenses/LICENSE-cloudpickle.txt +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2012, Regents of the University of California. -Copyright (c) 2009 `PiCloud, Inc. `_. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the University of California, Berkeley nor the - names of its contributors may be used to endorse or promote - products derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-d3.min.js.txt b/licenses/LICENSE-d3.min.js.txt deleted file mode 100644 index c71e3f254c068..0000000000000 --- a/licenses/LICENSE-d3.min.js.txt +++ /dev/null @@ -1,26 +0,0 @@ -Copyright (c) 2010-2015, Michael Bostock -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* The name Michael Bostock may not be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-dagre-d3.txt b/licenses/LICENSE-dagre-d3.txt deleted file mode 100644 index 4864fe05e9803..0000000000000 --- a/licenses/LICENSE-dagre-d3.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2013 Chris Pettitt - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-f2j.txt b/licenses/LICENSE-f2j.txt deleted file mode 100644 index e28fd3ccdfa69..0000000000000 --- a/licenses/LICENSE-f2j.txt +++ /dev/null @@ -1,8 +0,0 @@ -Copyright © 2015 The University of Tennessee. All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -· Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -· Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. -· Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. 
\ No newline at end of file diff --git a/licenses/LICENSE-graphlib-dot.txt b/licenses/LICENSE-graphlib-dot.txt deleted file mode 100644 index c9e18cd562423..0000000000000 --- a/licenses/LICENSE-graphlib-dot.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2012-2013 Chris Pettitt - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-heapq.txt b/licenses/LICENSE-heapq.txt deleted file mode 100644 index 0c4c4b954bea4..0000000000000 --- a/licenses/LICENSE-heapq.txt +++ /dev/null @@ -1,280 +0,0 @@ - -# A. HISTORY OF THE SOFTWARE -# ========================== -# -# Python was created in the early 1990s by Guido van Rossum at Stichting -# Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands -# as a successor of a language called ABC. Guido remains Python's -# principal author, although it includes many contributions from others. -# -# In 1995, Guido continued his work on Python at the Corporation for -# National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) -# in Reston, Virginia where he released several versions of the -# software. -# -# In May 2000, Guido and the Python core development team moved to -# BeOpen.com to form the BeOpen PythonLabs team. In October of the same -# year, the PythonLabs team moved to Digital Creations (now Zope -# Corporation, see http://www.zope.com). In 2001, the Python Software -# Foundation (PSF, see http://www.python.org/psf/) was formed, a -# non-profit organization created specifically to own Python-related -# Intellectual Property. Zope Corporation is a sponsoring member of -# the PSF. -# -# All Python releases are Open Source (see http://www.opensource.org for -# the Open Source Definition). Historically, most, but not all, Python -# releases have also been GPL-compatible; the table below summarizes -# the various releases. -# -# Release Derived Year Owner GPL- -# from compatible? 
(1) -# -# 0.9.0 thru 1.2 1991-1995 CWI yes -# 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes -# 1.6 1.5.2 2000 CNRI no -# 2.0 1.6 2000 BeOpen.com no -# 1.6.1 1.6 2001 CNRI yes (2) -# 2.1 2.0+1.6.1 2001 PSF no -# 2.0.1 2.0+1.6.1 2001 PSF yes -# 2.1.1 2.1+2.0.1 2001 PSF yes -# 2.2 2.1.1 2001 PSF yes -# 2.1.2 2.1.1 2002 PSF yes -# 2.1.3 2.1.2 2002 PSF yes -# 2.2.1 2.2 2002 PSF yes -# 2.2.2 2.2.1 2002 PSF yes -# 2.2.3 2.2.2 2003 PSF yes -# 2.3 2.2.2 2002-2003 PSF yes -# 2.3.1 2.3 2002-2003 PSF yes -# 2.3.2 2.3.1 2002-2003 PSF yes -# 2.3.3 2.3.2 2002-2003 PSF yes -# 2.3.4 2.3.3 2004 PSF yes -# 2.3.5 2.3.4 2005 PSF yes -# 2.4 2.3 2004 PSF yes -# 2.4.1 2.4 2005 PSF yes -# 2.4.2 2.4.1 2005 PSF yes -# 2.4.3 2.4.2 2006 PSF yes -# 2.4.4 2.4.3 2006 PSF yes -# 2.5 2.4 2006 PSF yes -# 2.5.1 2.5 2007 PSF yes -# 2.5.2 2.5.1 2008 PSF yes -# 2.5.3 2.5.2 2008 PSF yes -# 2.6 2.5 2008 PSF yes -# 2.6.1 2.6 2008 PSF yes -# 2.6.2 2.6.1 2009 PSF yes -# 2.6.3 2.6.2 2009 PSF yes -# 2.6.4 2.6.3 2009 PSF yes -# 2.6.5 2.6.4 2010 PSF yes -# 2.7 2.6 2010 PSF yes -# -# Footnotes: -# -# (1) GPL-compatible doesn't mean that we're distributing Python under -# the GPL. All Python licenses, unlike the GPL, let you distribute -# a modified version without making your changes open source. The -# GPL-compatible licenses make it possible to combine Python with -# other software that is released under the GPL; the others don't. -# -# (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, -# because its license has a choice of law clause. According to -# CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 -# is "not incompatible" with the GPL. -# -# Thanks to the many outside volunteers who have worked under Guido's -# direction to make these releases possible. -# -# -# B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON -# =============================================================== -# -# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -# -------------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Python Software Foundation -# ("PSF"), and the Individual or Organization ("Licensee") accessing and -# otherwise using this software ("Python") in source or binary form and -# its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, PSF hereby -# grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, -# analyze, test, perform and/or display publicly, prepare derivative works, -# distribute, and otherwise use Python alone or in any derivative version, -# provided, however, that PSF's License Agreement and PSF's notice of copyright, -# i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained -# in Python alone or in any derivative version prepared by Licensee. -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python. -# -# 4. PSF is making Python available to Licensee on an "AS IS" -# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. Nothing in this License Agreement shall be deemed to create any -# relationship of agency, partnership, or joint venture between PSF and -# Licensee. This License Agreement does not grant permission to use PSF -# trademarks or trade name in a trademark sense to endorse or promote -# products or services of Licensee, or any third party. -# -# 8. By copying, installing or otherwise using Python, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 -# ------------------------------------------- -# -# BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 -# -# 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an -# office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the -# Individual or Organization ("Licensee") accessing and otherwise using -# this software in source or binary form and its associated -# documentation ("the Software"). -# -# 2. Subject to the terms and conditions of this BeOpen Python License -# Agreement, BeOpen hereby grants Licensee a non-exclusive, -# royalty-free, world-wide license to reproduce, analyze, test, perform -# and/or display publicly, prepare derivative works, distribute, and -# otherwise use the Software alone or in any derivative version, -# provided, however, that the BeOpen Python License is retained in the -# Software, alone or in any derivative version prepared by Licensee. -# -# 3. BeOpen is making the Software available to Licensee on an "AS IS" -# basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE -# SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS -# AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY -# DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 5. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 6. This License Agreement shall be governed by and interpreted in all -# respects by the law of the State of California, excluding conflict of -# law provisions. Nothing in this License Agreement shall be deemed to -# create any relationship of agency, partnership, or joint venture -# between BeOpen and Licensee. This License Agreement does not grant -# permission to use BeOpen trademarks or trade names in a trademark -# sense to endorse or promote products or services of Licensee, or any -# third party. 
As an exception, the "BeOpen Python" logos available at -# http://www.pythonlabs.com/logos.html may be used according to the -# permissions granted on that web page. -# -# 7. By copying, installing or otherwise using the software, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 -# --------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Corporation for National -# Research Initiatives, having an office at 1895 Preston White Drive, -# Reston, VA 20191 ("CNRI"), and the Individual or Organization -# ("Licensee") accessing and otherwise using Python 1.6.1 software in -# source or binary form and its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, CNRI -# hereby grants Licensee a nonexclusive, royalty-free, world-wide -# license to reproduce, analyze, test, perform and/or display publicly, -# prepare derivative works, distribute, and otherwise use Python 1.6.1 -# alone or in any derivative version, provided, however, that CNRI's -# License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) -# 1995-2001 Corporation for National Research Initiatives; All Rights -# Reserved" are retained in Python 1.6.1 alone or in any derivative -# version prepared by Licensee. Alternately, in lieu of CNRI's License -# Agreement, Licensee may substitute the following text (omitting the -# quotes): "Python 1.6.1 is made available subject to the terms and -# conditions in CNRI's License Agreement. This Agreement together with -# Python 1.6.1 may be located on the Internet using the following -# unique, persistent identifier (known as a handle): 1895.22/1013. This -# Agreement may also be obtained from a proxy server on the Internet -# using the following URL: http://hdl.handle.net/1895.22/1013". -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python 1.6.1 or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python 1.6.1. -# -# 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" -# basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. This License Agreement shall be governed by the federal -# intellectual property law of the United States, including without -# limitation the federal copyright law, and, to the extent such -# U.S. federal law does not apply, by the law of the Commonwealth of -# Virginia, excluding Virginia's conflict of law provisions. 
-# Notwithstanding the foregoing, with regard to derivative works based -# on Python 1.6.1 that incorporate non-separable material that was -# previously distributed under the GNU General Public License (GPL), the -# law of the Commonwealth of Virginia shall govern this License -# Agreement only as to issues arising under or with respect to -# Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this -# License Agreement shall be deemed to create any relationship of -# agency, partnership, or joint venture between CNRI and Licensee. This -# License Agreement does not grant permission to use CNRI trademarks or -# trade name in a trademark sense to endorse or promote products or -# services of Licensee, or any third party. -# -# 8. By clicking on the "ACCEPT" button where indicated, or by copying, -# installing or otherwise using Python 1.6.1, Licensee agrees to be -# bound by the terms and conditions of this License Agreement. -# -# ACCEPT -# -# -# CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 -# -------------------------------------------------- -# -# Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, -# The Netherlands. All rights reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Stichting Mathematisch -# Centrum or CWI not be used in advertising or publicity pertaining to -# distribution of the software without specific, written prior -# permission. -# -# STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO -# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -# FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE -# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-javolution.txt b/licenses/LICENSE-javolution.txt deleted file mode 100644 index b64af4d8298aa..0000000000000 --- a/licenses/LICENSE-javolution.txt +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Javolution - Java(tm) Solution for Real-Time and Embedded Systems - * Copyright (c) 2012, Javolution (http://javolution.org/) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ \ No newline at end of file diff --git a/licenses/LICENSE-jbcrypt.txt b/licenses/LICENSE-jbcrypt.txt deleted file mode 100644 index d332534c06356..0000000000000 --- a/licenses/LICENSE-jbcrypt.txt +++ /dev/null @@ -1,17 +0,0 @@ -jBCrypt is subject to the following license: - -/* - * Copyright (c) 2006 Damien Miller - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ diff --git a/licenses/LICENSE-jblas.txt b/licenses/LICENSE-jblas.txt deleted file mode 100644 index 5629dafb65b39..0000000000000 --- a/licenses/LICENSE-jblas.txt +++ /dev/null @@ -1,31 +0,0 @@ -Copyright (c) 2009, Mikio L. Braun and contributors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the Technische Universität Berlin nor the - names of its contributors may be used to endorse or promote - products derived from this software without specific prior - written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/licenses/LICENSE-jline.txt b/licenses/LICENSE-jline.txt deleted file mode 100644 index 2ec539d10ac54..0000000000000 --- a/licenses/LICENSE-jline.txt +++ /dev/null @@ -1,32 +0,0 @@ -Copyright (c) 2002-2006, Marc Prud'hommeaux -All rights reserved. - -Redistribution and use in source and binary forms, with or -without modification, are permitted provided that the following -conditions are met: - -Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with -the distribution. - -Neither the name of JLine nor the names of its contributors -may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, -BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-jpmml-model.txt b/licenses/LICENSE-jpmml-model.txt deleted file mode 100644 index 69411d1c6e9a8..0000000000000 --- a/licenses/LICENSE-jpmml-model.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2009, University of Tartu -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/licenses/LICENSE-jquery.txt b/licenses/LICENSE-jquery.txt deleted file mode 100644 index e1dd696d3b6cc..0000000000000 --- a/licenses/LICENSE-jquery.txt +++ /dev/null @@ -1,9 +0,0 @@ -The MIT License (MIT) - -Copyright (c) - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-junit-interface.txt b/licenses/LICENSE-junit-interface.txt deleted file mode 100644 index e835350c4e2a4..0000000000000 --- a/licenses/LICENSE-junit-interface.txt +++ /dev/null @@ -1,24 +0,0 @@ -Copyright (c) 2009-2012, Stefan Zeiger -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-kryo.txt b/licenses/LICENSE-kryo.txt deleted file mode 100644 index 3f6a160c238e5..0000000000000 --- a/licenses/LICENSE-kryo.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-minlog.txt b/licenses/LICENSE-minlog.txt deleted file mode 100644 index 3f6a160c238e5..0000000000000 --- a/licenses/LICENSE-minlog.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-netlib.txt b/licenses/LICENSE-netlib.txt deleted file mode 100644 index 75783ed6bc357..0000000000000 --- a/licenses/LICENSE-netlib.txt +++ /dev/null @@ -1,49 +0,0 @@ -Copyright (c) 2013 Samuel Halliday -Copyright (c) 1992-2011 The University of Tennessee and The University - of Tennessee Research Foundation. All rights - reserved. -Copyright (c) 2000-2011 The University of California Berkeley. All - rights reserved. -Copyright (c) 2006-2011 The University of Colorado Denver. All rights - reserved. 
- -$COPYRIGHT$ - -Additional copyrights may follow - -$HEADER$ - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -- Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer listed - in this license in the documentation and/or other materials - provided with the distribution. - -- Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -The copyright holders provide no reassurances that the source code -provided does not infringe any patent, copyright, or any other -intellectual property rights of third parties. The copyright holders -disclaim any liability to any recipient for claims brought against -recipient by any third party for infringement of that parties -intellectual property rights. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-paranamer.txt b/licenses/LICENSE-paranamer.txt deleted file mode 100644 index fca18473ba03f..0000000000000 --- a/licenses/LICENSE-paranamer.txt +++ /dev/null @@ -1,28 +0,0 @@ -[ ParaNamer used to be 'Pubic Domain', but since it includes a small piece of ASM it is now the same license as that: BSD ] - - Copyright (c) 2006 Paul Hammant & ThoughtWorks Inc - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - 3. Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-protobuf.txt b/licenses/LICENSE-protobuf.txt deleted file mode 100644 index b4350ec83c758..0000000000000 --- a/licenses/LICENSE-protobuf.txt +++ /dev/null @@ -1,42 +0,0 @@ -This license applies to all parts of Protocol Buffers except the following: - - - Atomicops support for generic gcc, located in - src/google/protobuf/stubs/atomicops_internals_generic_gcc.h. - This file is copyrighted by Red Hat Inc. - - - Atomicops support for AIX/POWER, located in - src/google/protobuf/stubs/atomicops_internals_aix.h. - This file is copyrighted by Bloomberg Finance LP. - -Copyright 2014, Google Inc. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Code generated by the Protocol Buffer compiler is owned by the owner -of the input file used when generating it. This code is not -standalone and requires a support library to be linked with it. This -support library is itself covered by the above license. \ No newline at end of file diff --git a/licenses/LICENSE-py4j.txt b/licenses/LICENSE-py4j.txt deleted file mode 100644 index 70af3e69ed67a..0000000000000 --- a/licenses/LICENSE-py4j.txt +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. 
- -- Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -- The name of the author may not be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - diff --git a/licenses/LICENSE-pyrolite.txt b/licenses/LICENSE-pyrolite.txt deleted file mode 100644 index 9457c7aa66140..0000000000000 --- a/licenses/LICENSE-pyrolite.txt +++ /dev/null @@ -1,28 +0,0 @@ - -Pyro - Python Remote Objects -Software License, copyright, and disclaimer - - Pyro is Copyright (c) by Irmen de Jong (irmen@razorvine.net). - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - -This is the "MIT Software License" which is OSI-certified, and GPL-compatible. -See http://www.opensource.org/licenses/mit-license.php - diff --git a/licenses/LICENSE-reflectasm.txt b/licenses/LICENSE-reflectasm.txt deleted file mode 100644 index 3f6a160c238e5..0000000000000 --- a/licenses/LICENSE-reflectasm.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-sbt-launch-lib.txt b/licenses/LICENSE-sbt-launch-lib.txt deleted file mode 100644 index 3b9156baaab78..0000000000000 --- a/licenses/LICENSE-sbt-launch-lib.txt +++ /dev/null @@ -1,26 +0,0 @@ -// Generated from http://www.opensource.org/licenses/bsd-license.php -Copyright (c) 2011, Paul Phillips. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the author nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-scala.txt b/licenses/LICENSE-scala.txt deleted file mode 100644 index 4846076aba246..0000000000000 --- a/licenses/LICENSE-scala.txt +++ /dev/null @@ -1,30 +0,0 @@ -Copyright (c) 2002-2013 EPFL -Copyright (c) 2011-2013 Typesafe, Inc. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- -- Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -- Neither the name of the EPFL nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/licenses/LICENSE-scalacheck.txt b/licenses/LICENSE-scalacheck.txt deleted file mode 100644 index cb8f97842f4c4..0000000000000 --- a/licenses/LICENSE-scalacheck.txt +++ /dev/null @@ -1,32 +0,0 @@ -ScalaCheck LICENSE - -Copyright (c) 2007-2015, Rickard Nilsson -All rights reserved. - -Permission to use, copy, modify, and distribute this software in source -or binary form for any purpose with or without fee is hereby granted, -provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of the author nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. 
\ No newline at end of file diff --git a/licenses/LICENSE-scopt.txt b/licenses/LICENSE-scopt.txt deleted file mode 100644 index 2bf24b9b9f848..0000000000000 --- a/licenses/LICENSE-scopt.txt +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-slf4j.txt b/licenses/LICENSE-slf4j.txt deleted file mode 100644 index 6548cd3af4322..0000000000000 --- a/licenses/LICENSE-slf4j.txt +++ /dev/null @@ -1,21 +0,0 @@ -Copyright (c) 2004-2013 QOS.ch - All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/licenses/LICENSE-sorttable.js.txt b/licenses/LICENSE-sorttable.js.txt deleted file mode 100644 index b31a5b206bf40..0000000000000 --- a/licenses/LICENSE-sorttable.js.txt +++ /dev/null @@ -1,16 +0,0 @@ -Copyright (c) 1997-2007 Stuart Langridge - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/licenses/LICENSE-spire.txt b/licenses/LICENSE-spire.txt deleted file mode 100644 index 40af7746b9315..0000000000000 --- a/licenses/LICENSE-spire.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011-2012 Erik Osheim, Tom Switzer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-xmlenc.txt b/licenses/LICENSE-xmlenc.txt deleted file mode 100644 index 3a70c9bfcdadd..0000000000000 --- a/licenses/LICENSE-xmlenc.txt +++ /dev/null @@ -1,27 +0,0 @@ -Copyright 2003-2005, Ernst de Haan -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/make-distribution.sh b/make-distribution.sh index e1c2afdbc6d87..cbb1e0c2fd530 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -198,7 +198,6 @@ fi # Copy license and ASF files cp "$SPARK_HOME/LICENSE" "$DISTDIR" -cp -r "$SPARK_HOME/licenses" "$DISTDIR" cp "$SPARK_HOME/NOTICE" "$DISTDIR" if [ -e "$SPARK_HOME"/CHANGES.txt ]; then diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index e0dcd427fae24..19fe039b8fd03 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.SchemaUtils @@ -145,10 +145,6 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, /** @group setParam */ def setPredictionCol(value: String): M = set(predictionCol, value).asInstanceOf[M] - /** Returns the number of features the model was trained on. If unknown, returns -1 */ - @Since("1.6.0") - def numFeatures: Int = -1 - /** * Returns the SQL DataType corresponding to the FeaturesType type parameter. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala index a7c10333c0d53..e479f169021d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala @@ -124,28 +124,18 @@ private[attribute] trait AttributeFactory { private[attribute] def fromMetadata(metadata: Metadata): Attribute /** - * Creates an [[Attribute]] from a [[StructField]] instance, optionally preserving name. + * Creates an [[Attribute]] from a [[StructField]] instance. */ - private[ml] def decodeStructField(field: StructField, preserveName: Boolean): Attribute = { + def fromStructField(field: StructField): Attribute = { require(field.dataType.isInstanceOf[NumericType]) val metadata = field.metadata val mlAttr = AttributeKeys.ML_ATTR if (metadata.contains(mlAttr)) { - val attr = fromMetadata(metadata.getMetadata(mlAttr)) - if (preserveName) { - attr - } else { - attr.withName(field.name) - } + fromMetadata(metadata.getMetadata(mlAttr)).withName(field.name) } else { UnresolvedAttribute } } - - /** - * Creates an [[Attribute]] from a [[StructField]] instance. 
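[Editor's note, not part of the patch] With decodeStructField removed in the hunk above, Attribute.fromStructField goes back to always stamping the StructField's own name onto whatever ML attribute metadata it finds. A minimal standalone sketch of that behaviour, using made-up column names:

    import org.apache.spark.ml.attribute.{Attribute, NumericAttribute}
    import org.apache.spark.sql.types.StructField

    // Metadata originally written under the name "rawScore"...
    val stored: StructField = NumericAttribute.defaultAttr
      .withName("rawScore")
      .withMin(0.0)
      .toStructField()

    // ...but read back through a field that has been renamed to "score".
    val renamed = StructField("score", stored.dataType, stored.nullable, stored.metadata)
    val attr: Attribute = Attribute.fromStructField(renamed)
    println(attr.name)   // Some(score): the field name wins over the stored attribute name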
- */ - def fromStructField(field: StructField): Attribute = decodeStructField(field, false) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index c478aea44ace8..d064542503482 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.HasCheckpointInterval import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams} import org.apache.spark.ml.tree.impl.RandomForest import org.apache.spark.ml.util.{Identifiable, MetadataUtils} @@ -108,7 +109,6 @@ object DecisionTreeClassifier { final class DecisionTreeClassificationModel private[ml] ( override val uid: String, override val rootNode: Node, - override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] with DecisionTreeModel with Serializable { @@ -120,8 +120,8 @@ final class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. */ - private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) = - this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses) + private[ml] def this(rootNode: Node, numClasses: Int) = + this(Identifiable.randomUID("dtc"), rootNode, numClasses) override protected def predict(features: Vector): Double = { rootNode.predictImpl(features).prediction @@ -143,7 +143,7 @@ final class DecisionTreeClassificationModel private[ml] ( } override def copy(extra: ParamMap): DecisionTreeClassificationModel = { - copyValues(new DecisionTreeClassificationModel(uid, rootNode, numFeatures, numClasses), extra) + copyValues(new DecisionTreeClassificationModel(uid, rootNode, numClasses), extra) .setParent(parent) } @@ -163,14 +163,12 @@ private[ml] object DecisionTreeClassificationModel { def fromOld( oldModel: OldDecisionTreeModel, parent: DecisionTreeClassifier, - categoricalFeatures: Map[Int, Int], - numFeatures: Int = -1): DecisionTreeClassificationModel = { + categoricalFeatures: Map[Int, Int]): DecisionTreeClassificationModel = { require(oldModel.algo == OldAlgo.Classification, s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). 
Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc") - // Can't infer number of features from old model, so default to -1 - new DecisionTreeClassificationModel(uid, rootNode, numFeatures, -1) + new DecisionTreeClassificationModel(uid, rootNode, -1) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 74aef94bf7675..ad8683648b975 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -33,7 +33,7 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.tree.loss.{LogLoss => OldLogLoss, Loss => OldLoss} import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @@ -138,11 +138,10 @@ final class GBTClassifier(override val uid: String) require(numClasses == 2, s"GBTClassifier only supports binary classification but was given numClasses = $numClasses") val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset) - val numFeatures = oldDataset.first().features.size val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification) val oldGBT = new OldGBT(boostingStrategy) val oldModel = oldGBT.run(oldDataset) - GBTClassificationModel.fromOld(oldModel, this, categoricalFeatures, numFeatures) + GBTClassificationModel.fromOld(oldModel, this, categoricalFeatures) } override def copy(extra: ParamMap): GBTClassifier = defaultCopy(extra) @@ -165,11 +164,10 @@ object GBTClassifier { * @param _treeWeights Weights for the decision trees in the ensemble. */ @Experimental -final class GBTClassificationModel private[ml]( +final class GBTClassificationModel( override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], - private val _treeWeights: Array[Double], - override val numFeatures: Int) + private val _treeWeights: Array[Double]) extends PredictionModel[Vector, GBTClassificationModel] with TreeEnsembleModel with Serializable { @@ -177,14 +175,6 @@ final class GBTClassificationModel private[ml]( require(_trees.length == _treeWeights.length, "GBTClassificationModel given trees, treeWeights" + s" of non-matching lengths (${_trees.length}, ${_treeWeights.length}, respectively).") - /** - * Construct a GBTClassificationModel - * @param _trees Decision trees in the ensemble. - * @param _treeWeights Weights for the decision trees in the ensemble. 
- */ - def this(uid: String, _trees: Array[DecisionTreeRegressionModel], _treeWeights: Array[Double]) = - this(uid, _trees, _treeWeights, -1) - override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] override def treeWeights: Array[Double] = _treeWeights @@ -206,8 +196,7 @@ final class GBTClassificationModel private[ml]( } override def copy(extra: ParamMap): GBTClassificationModel = { - copyValues(new GBTClassificationModel(uid, _trees, _treeWeights, numFeatures), - extra).setParent(parent) + copyValues(new GBTClassificationModel(uid, _trees, _treeWeights), extra).setParent(parent) } override def toString: String = { @@ -226,8 +215,7 @@ private[ml] object GBTClassificationModel { def fromOld( oldModel: OldGBTModel, parent: GBTClassifier, - categoricalFeatures: Map[Int, Int], - numFeatures: Int = -1): GBTClassificationModel = { + categoricalFeatures: Map[Int, Int]): GBTClassificationModel = { require(oldModel.algo == OldAlgo.Classification, "Cannot convert GradientBoostedTreesModel" + s" with algo=${oldModel.algo} (old API) to GBTClassificationModel (new API).") val newTrees = oldModel.trees.map { tree => @@ -235,6 +223,6 @@ private[ml] object GBTClassificationModel { DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtc") - new GBTClassificationModel(parent.uid, newTrees, oldModel.treeWeights, numFeatures) + new GBTClassificationModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index cd7462596dd9e..5f60dea91fcfa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -181,8 +181,6 @@ class MultilayerPerceptronClassificationModel private[ml] ( extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable { - override val numFeatures: Int = layers.head - private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights) /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a14dcecbaf5b9..082ea1ffad58f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -137,8 +137,6 @@ class NaiveBayesModel private[ml] ( throw new UnknownError(s"Invalid modelType: ${$(modelType)}.") } - override val numFeatures: Int = theta.numCols - override val numClasses: Int = pi.size private def multinomialCalculation(features: Vector) = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index bae329692a68d..a6ebee1bb10af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -119,12 +119,13 @@ object RandomForestClassifier { * features. * @param _trees Decision trees in the ensemble. * Warning: These have null parents. 
+ * @param numFeatures Number of features used by this model */ @Experimental final class RandomForestClassificationModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeClassificationModel], - override val numFeatures: Int, + val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with TreeEnsembleModel with Serializable { @@ -225,8 +226,7 @@ private[ml] object RandomForestClassificationModel { oldModel: OldRandomForestModel, parent: RandomForestClassifier, categoricalFeatures: Map[Int, Int], - numClasses: Int, - numFeatures: Int = -1): RandomForestClassificationModel = { + numClasses: Int): RandomForestClassificationModel = { require(oldModel.algo == OldAlgo.Classification, "Cannot convert RandomForestModel" + s" with algo=${oldModel.algo} (old API) to RandomForestClassificationModel (new API).") val newTrees = oldModel.trees.map { tree => @@ -234,6 +234,6 @@ private[ml] object RandomForestClassificationModel { DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures) } val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc") - new RandomForestClassificationModel(uid, newTrees, numFeatures, numClasses) + new RandomForestClassificationModel(uid, newTrees, -1, numClasses) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala deleted file mode 100644 index 5e4061fba5494..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.feature - -import org.apache.spark.annotation.Experimental -import org.apache.spark.ml._ -import org.apache.spark.ml.attribute.{AttributeGroup, _} -import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.util.SchemaUtils -import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} - -/** - * Params for [[ChiSqSelector]] and [[ChiSqSelectorModel]]. - */ -private[feature] trait ChiSqSelectorParams extends Params - with HasFeaturesCol with HasOutputCol with HasLabelCol { - - /** - * Number of features that selector will select (ordered by statistic value descending). If the - * number of features is < numTopFeatures, then this will select all features. The default value - * of numTopFeatures is 50. 
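[Editor's note, not part of the patch] For context on what this deletion removes: ChiSqSelector was an Estimator wrapping mllib.feature.ChiSqSelector. A hedged usage sketch against a build that still has the class, with a toy dataset and illustrative column names:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.ml.feature.ChiSqSelector
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.sql.SQLContext

    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("chisq-sketch"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Toy data: a Double label and three categorical features encoded as doubles.
    val df = sc.parallelize(Seq(
      (0.0, Vectors.dense(0.0, 0.0, 1.0)),
      (1.0, Vectors.dense(1.0, 0.0, 0.0)),
      (1.0, Vectors.dense(1.0, 1.0, 0.0))
    )).toDF("label", "features")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(2)          // keep the two features with the largest chi-squared statistic
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setOutputCol("selected")

    val model = selector.fit(df)     // delegates to mllib.feature.ChiSqSelector
    model.transform(df).show()       // appends the "selected" vector column
    sc.stop()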
- * @group param - */ - final val numTopFeatures = new IntParam(this, "numTopFeatures", - "Number of features that selector will select, ordered by statistics value descending. If the" + - " number of features is < numTopFeatures, then this will select all features.", - ParamValidators.gtEq(1)) - setDefault(numTopFeatures -> 50) - - /** @group getParam */ - def getNumTopFeatures: Int = $(numTopFeatures) -} - -/** - * :: Experimental :: - * Chi-Squared feature selection, which selects categorical features to use for predicting a - * categorical label. - */ -@Experimental -final class ChiSqSelector(override val uid: String) - extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams { - - def this() = this(Identifiable.randomUID("chiSqSelector")) - - /** @group setParam */ - def setNumTopFeatures(value: Int): this.type = set(numTopFeatures, value) - - /** @group setParam */ - def setFeaturesCol(value: String): this.type = set(featuresCol, value) - - /** @group setParam */ - def setOutputCol(value: String): this.type = set(outputCol, value) - - /** @group setParam */ - def setLabelCol(value: String): this.type = set(labelCol, value) - - override def fit(dataset: DataFrame): ChiSqSelectorModel = { - transformSchema(dataset.schema, logging = true) - val input = dataset.select($(labelCol), $(featuresCol)).map { - case Row(label: Double, features: Vector) => - LabeledPoint(label, features) - } - val chiSqSelector = new feature.ChiSqSelector($(numTopFeatures)).fit(input) - copyValues(new ChiSqSelectorModel(uid, chiSqSelector).setParent(this)) - } - - override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) - SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) - SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) - } - - override def copy(extra: ParamMap): ChiSqSelector = defaultCopy(extra) -} - -/** - * :: Experimental :: - * Model fitted by [[ChiSqSelector]]. - */ -@Experimental -final class ChiSqSelectorModel private[ml] ( - override val uid: String, - private val chiSqSelector: feature.ChiSqSelectorModel) - extends Model[ChiSqSelectorModel] with ChiSqSelectorParams { - - /** @group setParam */ - def setFeaturesCol(value: String): this.type = set(featuresCol, value) - - /** @group setParam */ - def setOutputCol(value: String): this.type = set(outputCol, value) - - /** @group setParam */ - def setLabelCol(value: String): this.type = set(labelCol, value) - - override def transform(dataset: DataFrame): DataFrame = { - val transformedSchema = transformSchema(dataset.schema, logging = true) - val newField = transformedSchema.last - val selector = udf { chiSqSelector.transform _ } - dataset.withColumn($(outputCol), selector(col($(featuresCol))), newField.metadata) - } - - override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) - val newField = prepOutputField(schema) - val outputFields = schema.fields :+ newField - StructType(outputFields) - } - - /** - * Prepare the output column field, including per-feature metadata. 
- */ - private def prepOutputField(schema: StructType): StructField = { - val selector = chiSqSelector.selectedFeatures.toSet - val origAttrGroup = AttributeGroup.fromStructField(schema($(featuresCol))) - val featureAttributes: Array[Attribute] = if (origAttrGroup.attributes.nonEmpty) { - origAttrGroup.attributes.get.zipWithIndex.filter(x => selector.contains(x._2)).map(_._1) - } else { - Array.fill[Attribute](selector.size)(NominalAttribute.defaultAttr) - } - val newAttributeGroup = new AttributeGroup($(outputCol), featureAttributes) - newAttributeGroup.toStructField() - } - - override def copy(extra: ParamMap): ChiSqSelectorModel = { - val copied = new ChiSqSelectorModel(uid, chiSqSelector) - copyValues(copied, extra).setParent(parent) - } -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala index 37f7862476cfe..9194763fb32f5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala @@ -149,14 +149,8 @@ class Interaction(override val uid: String) extends Transformer features.reverse.foreach { f => val encodedAttrs = f.dataType match { case _: NumericType | BooleanType => - val attr = Attribute.decodeStructField(f, preserveName = true) - if (attr == UnresolvedAttribute) { - encodedFeatureAttrs(Seq(NumericAttribute.defaultAttr.withName(f.name)), None) - } else if (!attr.name.isDefined) { - encodedFeatureAttrs(Seq(attr.withName(f.name)), None) - } else { - encodedFeatureAttrs(Seq(attr), None) - } + val attr = Attribute.fromStructField(f) + encodedFeatureAttrs(Seq(attr), None) case _: VectorUDT => val group = AttributeGroup.fromStructField(f) encodedFeatureAttrs(group.attributes.get, Some(group.name)) @@ -227,7 +221,7 @@ class Interaction(override val uid: String) extends Transformer * count is equal to the number of categories. For numeric features the count * should be set to 1. */ -private[ml] class FeatureEncoder(numFeatures: Array[Int]) extends Serializable { +private[ml] class FeatureEncoder(numFeatures: Array[Int]) { assert(numFeatures.forall(_ > 0), "Features counts must all be positive.") /** The size of the output vector. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala deleted file mode 100644 index 46b836da9cfde..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.feature - -import scala.collection.mutable - -import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental -import org.apache.spark.ml._ -import org.apache.spark.ml.attribute.NominalAttribute -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.param.{IntParam, _} -import org.apache.spark.ml.util._ -import org.apache.spark.sql.types.{DoubleType, StructType} -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.util.random.XORShiftRandom - -/** - * Params for [[QuantileDiscretizer]]. - */ -private[feature] trait QuantileDiscretizerBase extends Params with HasInputCol with HasOutputCol { - - /** - * Maximum number of buckets (quantiles, or categories) into which data points are grouped. Must - * be >= 2. - * default: 2 - * @group param - */ - val numBuckets = new IntParam(this, "numBuckets", "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2.", - ParamValidators.gtEq(2)) - setDefault(numBuckets -> 2) - - /** @group getParam */ - def getNumBuckets: Int = getOrDefault(numBuckets) -} - -/** - * :: Experimental :: - * `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned - * categorical features. The bin ranges are chosen by taking a sample of the data and dividing it - * into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity, - * covering all real values. This attempts to find numBuckets partitions based on a sample of data, - * but it may find fewer depending on the data sample values. - */ -@Experimental -final class QuantileDiscretizer(override val uid: String) - extends Estimator[Bucketizer] with QuantileDiscretizerBase { - - def this() = this(Identifiable.randomUID("quantileDiscretizer")) - - /** @group setParam */ - def setNumBuckets(value: Int): this.type = set(numBuckets, value) - - /** @group setParam */ - def setInputCol(value: String): this.type = set(inputCol, value) - - /** @group setParam */ - def setOutputCol(value: String): this.type = set(outputCol, value) - - override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) - val inputFields = schema.fields - require(inputFields.forall(_.name != $(outputCol)), - s"Output column ${$(outputCol)} already exists.") - val attr = NominalAttribute.defaultAttr.withName($(outputCol)) - val outputFields = inputFields :+ attr.toStructField() - StructType(outputFields) - } - - override def fit(dataset: DataFrame): Bucketizer = { - val samples = QuantileDiscretizer.getSampledInput(dataset.select($(inputCol)), $(numBuckets)) - .map { case Row(feature: Double) => feature } - val candidates = QuantileDiscretizer.findSplitCandidates(samples, $(numBuckets) - 1) - val splits = QuantileDiscretizer.getSplits(candidates) - val bucketizer = new Bucketizer(uid).setSplits(splits) - copyValues(bucketizer) - } - - override def copy(extra: ParamMap): QuantileDiscretizer = defaultCopy(extra) -} - -private[feature] object QuantileDiscretizer extends Logging { - /** - * Sampling from the given dataset to collect quantile statistics. 
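[Editor's note, not part of the patch] The sampling step described above aims at roughly max(numBuckets^2, 10000) rows. A small worked example of the fraction computed in getSampledInput, with illustrative numbers; note that the Int/Long division in the deleted code truncates before the min is applied:

    val numBins = 10
    val totalSamples = 1000000L                                        // dataset.count()
    val requiredSamples = math.max(numBins * numBins, 10000)           // = 10000
    // As written in the deleted code: integer division, so the fraction collapses to 0.0
    val fractionAsWritten = math.min(requiredSamples / totalSamples, 1.0)           // 0.0
    // The presumably intended value converts to Double before dividing
    val fractionIntended = math.min(requiredSamples.toDouble / totalSamples, 1.0)   // 0.01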
- */ - def getSampledInput(dataset: DataFrame, numBins: Int): Array[Row] = { - val totalSamples = dataset.count() - require(totalSamples > 0, - "QuantileDiscretizer requires non-empty input dataset but was given an empty input.") - val requiredSamples = math.max(numBins * numBins, 10000) - val fraction = math.min(requiredSamples / dataset.count(), 1.0) - dataset.sample(withReplacement = false, fraction, new XORShiftRandom().nextInt()).collect() - } - - /** - * Compute split points with respect to the sample distribution. - */ - def findSplitCandidates(samples: Array[Double], numSplits: Int): Array[Double] = { - val valueCountMap = samples.foldLeft(Map.empty[Double, Int]) { (m, x) => - m + ((x, m.getOrElse(x, 0) + 1)) - } - val valueCounts = valueCountMap.toSeq.sortBy(_._1).toArray ++ Array((Double.MaxValue, 1)) - val possibleSplits = valueCounts.length - 1 - if (possibleSplits <= numSplits) { - valueCounts.dropRight(1).map(_._1) - } else { - val stride: Double = math.ceil(samples.length.toDouble / (numSplits + 1)) - val splitsBuilder = mutable.ArrayBuilder.make[Double] - var index = 1 - // currentCount: sum of counts of values that have been visited - var currentCount = valueCounts(0)._2 - // targetCount: target value for `currentCount`. If `currentCount` is closest value to - // `targetCount`, then current value is a split threshold. After finding a split threshold, - // `targetCount` is added by stride. - var targetCount = stride - while (index < valueCounts.length) { - val previousCount = currentCount - currentCount += valueCounts(index)._2 - val previousGap = math.abs(previousCount - targetCount) - val currentGap = math.abs(currentCount - targetCount) - // If adding count of current value to currentCount makes the gap between currentCount and - // targetCount smaller, previous value is a split threshold. - if (previousGap < currentGap) { - splitsBuilder += valueCounts(index - 1)._1 - targetCount += stride - } - index += 1 - } - splitsBuilder.result() - } - } - - /** - * Adjust split candidates to proper splits by: adding positive/negative infinity to both sides as - * needed, and adding a default split value of 0 if no good candidates are found. 
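[Editor's note, not part of the patch] Read together with getSplits below, the deleted split handling behaves as follows on concrete inputs (a sketch assuming access to the package-private helpers, e.g. from a test in org.apache.spark.ml.feature):

    QuantileDiscretizer.getSplits(Array(1.0, 5.0))
    // -> Array(-Infinity, 1.0, 5.0, Infinity)
    QuantileDiscretizer.getSplits(Array(Double.NegativeInfinity, 3.0, Double.PositiveInfinity))
    // -> Array(-Infinity, 3.0, Infinity)      (existing infinities are not duplicated)
    QuantileDiscretizer.getSplits(Array.empty[Double])
    // -> Array(-Infinity, 0.0, Infinity)      (default split at 0 when no candidates survive)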
- */ - def getSplits(candidates: Array[Double]): Array[Double] = { - val effectiveValues = if (candidates.size != 0) { - if (candidates.head == Double.NegativeInfinity - && candidates.last == Double.PositiveInfinity) { - candidates.drop(1).dropRight(1) - } else if (candidates.head == Double.NegativeInfinity) { - candidates.drop(1) - } else if (candidates.last == Double.PositiveInfinity) { - candidates.dropRight(1) - } else { - candidates - } - } else { - candidates - } - - if (effectiveValues.size == 0) { - Array(Double.NegativeInfinity, 0, Double.PositiveInfinity) - } else { - Array(Double.NegativeInfinity) ++ effectiveValues ++ Array(Double.PositiveInfinity) - } - } -} - diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 5c43a41bee3b4..e196ebd62792e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -21,7 +21,6 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol} @@ -43,8 +42,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** * :: Experimental :: * Implements the transforms required for fitting a dataset against an R model formula. Currently - * we support a limited subset of the R operators, including '~', '.', ':', '+', and '-'. Also see - * the R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html + * we support a limited subset of the R operators, including '.', '~', '+', and '-'. Also see the + * R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html */ @Experimental class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase { @@ -83,54 +82,36 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R require(isDefined(formula), "Formula must be defined first.") val parsedFormula = RFormulaParser.parse($(formula)) val resolvedFormula = parsedFormula.resolve(dataset.schema) + // StringType terms and terms representing interactions need to be encoded before assembly. + // TODO(ekl) add support for feature interactions val encoderStages = ArrayBuffer[PipelineStage]() - - val prefixesToRewrite = mutable.Map[String, String]() val tempColumns = ArrayBuffer[String]() - def tmpColumn(category: String): String = { - val col = Identifiable.randomUID(category) - tempColumns += col - col - } - - // First we index each string column referenced by the input terms. 
- val indexed: Map[String, String] = resolvedFormula.terms.flatten.distinct.map { term => + val takenNames = mutable.Set(dataset.columns: _*) + val encodedTerms = resolvedFormula.terms.map { term => dataset.schema(term) match { case column if column.dataType == StringType => - val indexCol = tmpColumn("stridx") - encoderStages += new StringIndexer() - .setInputCol(term) - .setOutputCol(indexCol) - (term, indexCol) + val indexCol = term + "_idx_" + uid + val encodedCol = { + var tmp = term + while (takenNames.contains(tmp)) { + tmp += "_" + } + tmp + } + takenNames.add(indexCol) + takenNames.add(encodedCol) + encoderStages += new StringIndexer().setInputCol(term).setOutputCol(indexCol) + encoderStages += new OneHotEncoder().setInputCol(indexCol).setOutputCol(encodedCol) + tempColumns += indexCol + tempColumns += encodedCol + encodedCol case _ => - (term, term) + term } - }.toMap - - // Then we handle one-hot encoding and interactions between terms. - val encodedTerms = resolvedFormula.terms.map { - case Seq(term) if dataset.schema(term).dataType == StringType => - val encodedCol = tmpColumn("onehot") - encoderStages += new OneHotEncoder() - .setInputCol(indexed(term)) - .setOutputCol(encodedCol) - prefixesToRewrite(encodedCol + "_") = term + "_" - encodedCol - case Seq(term) => - term - case terms => - val interactionCol = tmpColumn("interaction") - encoderStages += new Interaction() - .setInputCols(terms.map(indexed).toArray) - .setOutputCol(interactionCol) - prefixesToRewrite(interactionCol + "_") = "" - interactionCol } - encoderStages += new VectorAssembler(uid) .setInputCols(encodedTerms.toArray) .setOutputCol($(featuresCol)) - encoderStages += new VectorAttributeRewriter($(featuresCol), prefixesToRewrite.toMap) encoderStages += new ColumnPruner(tempColumns.toSet) if (dataset.schema.fieldNames.contains(resolvedFormula.label) && @@ -245,53 +226,3 @@ private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer { override def copy(extra: ParamMap): ColumnPruner = defaultCopy(extra) } - -/** - * Utility transformer that rewrites Vector attribute names via prefix replacement. For example, - * it can rewrite attribute names starting with 'foo_' to start with 'bar_' instead. - * - * @param vectorCol name of the vector column to rewrite. - * @param prefixesToRewrite the map of string prefixes to their replacement values. Each attribute - * name defined in vectorCol will be checked against the keys of this - * map. When a key prefixes a name, the matching prefix will be replaced - * by the value in the map. 
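[Editor's note, not part of the patch] The prefix replacement described above can be paraphrased as a standalone rule; the prefixes and names below are made up for illustration:

    // One-hot and interaction columns get temporary UID-based prefixes during encoding;
    // the rewriter maps them back to user-facing names.
    val prefixesToRewrite = Map("onehot_9a1b_" -> "species_", "interaction_4f2c_" -> "")

    def rewrite(name: String): String =
      prefixesToRewrite.collectFirst {
        case (prefix, replacement) if name.startsWith(prefix) =>
          replacement + name.stripPrefix(prefix)
      }.getOrElse(name)

    rewrite("onehot_9a1b_versicolor")   // "species_versicolor"
    rewrite("interaction_4f2c_a:b")     // "a:b"
    rewrite("petalWidth")               // unchanged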
- */ -private class VectorAttributeRewriter( - vectorCol: String, - prefixesToRewrite: Map[String, String]) - extends Transformer { - - override val uid = Identifiable.randomUID("vectorAttrRewriter") - - override def transform(dataset: DataFrame): DataFrame = { - val metadata = { - val group = AttributeGroup.fromStructField(dataset.schema(vectorCol)) - val attrs = group.attributes.get.map { attr => - if (attr.name.isDefined) { - val name = attr.name.get - val replacement = prefixesToRewrite.filter { case (k, _) => name.startsWith(k) } - if (replacement.nonEmpty) { - val (k, v) = replacement.headOption.get - attr.withName(v + name.stripPrefix(k)) - } else { - attr - } - } else { - attr - } - } - new AttributeGroup(vectorCol, attrs).toMetadata() - } - val otherCols = dataset.columns.filter(_ != vectorCol).map(dataset.col) - val rewrittenCol = dataset.col(vectorCol).as(vectorCol, metadata) - dataset.select((otherCols :+ rewrittenCol): _*) - } - - override def transformSchema(schema: StructType): StructType = { - StructType( - schema.fields.filter(_.name != vectorCol) ++ - schema.fields.filter(_.name == vectorCol)) - } - - override def copy(extra: ParamMap): VectorAttributeRewriter = defaultCopy(extra) -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala index 4079b387e1834..1ca3b92a7d92a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.feature -import scala.collection.mutable import scala.util.parsing.combinator.RegexParsers import org.apache.spark.mllib.linalg.VectorUDT @@ -32,35 +31,27 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) { * of the special '.' term. Duplicate terms will be removed during resolution. */ def resolve(schema: StructType): ResolvedRFormula = { - val dotTerms = expandDot(schema) - var includedTerms = Seq[Seq[String]]() + var includedTerms = Seq[String]() terms.foreach { - case col: ColumnRef => - includedTerms :+= Seq(col.value) - case ColumnInteraction(cols) => - includedTerms ++= expandInteraction(schema, cols) case Dot => - includedTerms ++= dotTerms.map(Seq(_)) + includedTerms ++= simpleTypes(schema).filter(_ != label.value) + case ColumnRef(value) => + includedTerms :+= value case Deletion(term: Term) => term match { - case inner: ColumnRef => - includedTerms = includedTerms.filter(_ != Seq(inner.value)) - case ColumnInteraction(cols) => - val fromInteraction = expandInteraction(schema, cols).map(_.toSet) - includedTerms = includedTerms.filter(t => !fromInteraction.contains(t.toSet)) + case ColumnRef(value) => + includedTerms = includedTerms.filter(_ != value) case Dot => // e.g. "- .", which removes all first-order terms - includedTerms = includedTerms.filter { - case Seq(t) => !dotTerms.contains(t) - case _ => true - } + val fromSchema = simpleTypes(schema) + includedTerms = includedTerms.filter(fromSchema.contains(_)) case _: Deletion => - throw new RuntimeException("Deletion terms cannot be nested") + assert(false, "Deletion terms cannot be nested") case _: Intercept => } case _: Intercept => } - ResolvedRFormula(label.value, includedTerms.distinct, hasIntercept) + ResolvedRFormula(label.value, includedTerms.distinct) } /** Whether this formula specifies fitting with an intercept term. 
*/ @@ -76,54 +67,19 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) { intercept } - // expands the Dot operators in interaction terms - private def expandInteraction( - schema: StructType, terms: Seq[InteractableTerm]): Seq[Seq[String]] = { - if (terms.isEmpty) { - return Seq(Nil) - } - - val rest = expandInteraction(schema, terms.tail) - val validInteractions = (terms.head match { - case Dot => - expandDot(schema).flatMap { t => - rest.map { r => - Seq(t) ++ r - } - } - case ColumnRef(value) => - rest.map(Seq(value) ++ _) - }).map(_.distinct) - - // Deduplicates feature interactions, for example, a:b is the same as b:a. - var seen = mutable.Set[Set[String]]() - validInteractions.flatMap { - case t if seen.contains(t.toSet) => - None - case t => - seen += t.toSet - Some(t) - }.sortBy(_.length) - } - // the dot operator excludes complex column types - private def expandDot(schema: StructType): Seq[String] = { + private def simpleTypes(schema: StructType): Seq[String] = { schema.fields.filter(_.dataType match { case _: NumericType | StringType | BooleanType | _: VectorUDT => true case _ => false - }).map(_.name).filter(_ != label.value) + }).map(_.name) } } /** * Represents a fully evaluated and simplified R formula. - * @param label the column name of the R formula label (response variable). - * @param terms the simplified terms of the R formula. Interactions terms are represented as Seqs - * of column names; non-interaction terms as length 1 Seqs. - * @param hasIntercept whether the formula specifies fitting with an intercept. */ -private[ml] case class ResolvedRFormula( - label: String, terms: Seq[Seq[String]], hasIntercept: Boolean) +private[ml] case class ResolvedRFormula(label: String, terms: Seq[String]) /** * R formula terms. See the R formula docs here for more information: @@ -131,17 +87,11 @@ private[ml] case class ResolvedRFormula( */ private[ml] sealed trait Term -/** A term that may be part of an interaction, e.g. 'x' in 'x:y' */ -private[ml] sealed trait InteractableTerm extends Term - /* R formula reference to all available columns, e.g. "." in a formula */ -private[ml] case object Dot extends InteractableTerm +private[ml] case object Dot extends Term /* R formula reference to a column, e.g. "+ Species" in a formula */ -private[ml] case class ColumnRef(value: String) extends InteractableTerm - -/* R formula interaction of several columns, e.g. "Sepal_Length:Species" in a formula */ -private[ml] case class ColumnInteraction(terms: Seq[InteractableTerm]) extends Term +private[ml] case class ColumnRef(value: String) extends Term /* R formula intercept toggle, e.g. "+ 0" in a formula */ private[ml] case class Intercept(enabled: Boolean) extends Term @@ -150,30 +100,25 @@ private[ml] case class Intercept(enabled: Boolean) extends Term private[ml] case class Deletion(term: Term) extends Term /** - * Limited implementation of R formula parsing. Currently supports: '~', '+', '-', '.', ':'. + * Limited implementation of R formula parsing. Currently supports: '~', '+', '-', '.'. 
*/ private[ml] object RFormulaParser extends RegexParsers { - private val intercept: Parser[Intercept] = + def intercept: Parser[Intercept] = "([01])".r ^^ { case a => Intercept(a == "1") } - private val columnRef: Parser[ColumnRef] = + def columnRef: Parser[ColumnRef] = "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r ^^ { case a => ColumnRef(a) } - private val dot: Parser[InteractableTerm] = "\\.".r ^^ { case _ => Dot } - - private val interaction: Parser[List[InteractableTerm]] = rep1sep(columnRef | dot, ":") - - private val term: Parser[Term] = intercept | - interaction ^^ { case terms => ColumnInteraction(terms) } | dot | columnRef + def term: Parser[Term] = intercept | columnRef | "\\.".r ^^ { case _ => Dot } - private val terms: Parser[List[Term]] = (term ~ rep("+" ~ term | "-" ~ term)) ^^ { + def terms: Parser[List[Term]] = (term ~ rep("+" ~ term | "-" ~ term)) ^^ { case op ~ list => list.foldLeft(List(op)) { case (left, "+" ~ right) => left ++ Seq(right) case (left, "-" ~ right) => left ++ Seq(Deletion(right)) } } - private val formula: Parser[ParsedRFormula] = + def formula: Parser[ParsedRFormula] = (columnRef ~ "~" ~ terms) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) } def parse(value: String): ParsedRFormula = parseAll(formula, value) match { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 486274cd75a14..2b1592930e77b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -147,8 +147,9 @@ class StringIndexerModel ( } } + val outputColName = $(outputCol) val metadata = NominalAttribute.defaultAttr - .withName($(inputCol)).withValues(labels).toMetadata() + .withName(outputColName).withValues(labels).toMetadata() // If we are skipping invalid records, filter them out. val filteredDataset = (getHandleInvalid) match { case "skip" => { @@ -160,7 +161,7 @@ class StringIndexerModel ( case _ => dataset } filteredDataset.select(col("*"), - indexer(dataset($(inputCol)).cast(StringType)).as($(outputCol), metadata)) + indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index c7bca1243092c..0087543d6d42c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -56,9 +56,9 @@ private[shared] object SharedParamsCodeGen { ParamDesc[String]("inputCol", "input column name"), ParamDesc[Array[String]]("inputCols", "input column names"), ParamDesc[String]("outputCol", "output column name", Some("uid + \"__output\"")), - ParamDesc[Int]("checkpointInterval", "set checkpoint interval (>= 1) or " + - "disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed " + - "every 10 iterations", isValid = "(interval: Int) => interval == -1 || interval >= 1"), + ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1). E.g. 10 means that " + + "the cache will get checkpointed every 10 iterations.", + isValid = "ParamValidators.gtEq(1)"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[String]("handleInvalid", "how to handle invalid entries. 
Options are skip (which " + "will filter out rows with bad values), or error (which will throw an errror). More " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index cb2a060a34dd6..cb06c16ae8c71 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -223,10 +223,10 @@ private[ml] trait HasOutputCol extends Params { private[ml] trait HasCheckpointInterval extends Params { /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + * Param for checkpoint interval (>= 1). E.g. 10 means that the cache will get checkpointed every 10 iterations.. * @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1) + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "checkpoint interval (>= 1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", ParamValidators.gtEq(1)) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 535f266b9a944..d1f6b6eed3676 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -314,9 +314,9 @@ class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams { override def fit(dataset: DataFrame): ALSModel = { import dataset.sqlContext.implicits._ - val r = if ($(ratingCol) != "") col($(ratingCol)).cast(FloatType) else lit(1.0f) val ratings = dataset - .select(col($(userCol)).cast(IntegerType), col($(itemCol)).cast(IntegerType), r) + .select(col($(userCol)).cast(IntegerType), col($(itemCol)).cast(IntegerType), + col($(ratingCol)).cast(FloatType)) .map { row => Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) } @@ -555,7 +555,7 @@ object ALS extends Logging { var itemFactors = initialize(itemInBlocks, rank, seedGen.nextLong()) var previousCheckpointFile: Option[String] = None val shouldCheckpoint: Int => Boolean = (iter) => - sc.checkpointDir.isDefined && checkpointInterval != -1 && (iter % checkpointInterval == 0) + sc.checkpointDir.isDefined && (iter % checkpointInterval == 0) val deletePreviousCheckpointFile: () => Unit = () => previousCheckpointFile.foreach { file => try { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index b7d095872ffa5..84f9d62b81929 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -41,7 +41,7 @@ import org.apache.spark.storage.StorageLevel */ private[regression] trait AFTSurvivalRegressionParams extends Params with HasFeaturesCol with HasLabelCol with HasPredictionCol with HasMaxIter - with HasTol with HasFitIntercept with Logging { + with HasTol with HasFitIntercept { /** * Param for censor column name. 
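The checkpointInterval hunks above drop the old "-1 disables checkpointing" sentinel in favor of a plain lower-bound validator. A minimal sketch of what the new predicate accepts, assuming only ParamValidators from org.apache.spark.ml.param; the sample values are illustrative:

    import org.apache.spark.ml.param.ParamValidators

    // The regenerated shared param uses gtEq(1) as its isValid predicate, so the
    // former "-1 means no checkpointing" value is rejected when the param is set.
    val isValidInterval: Int => Boolean = ParamValidators.gtEq(1)
    assert(isValidInterval(10))   // checkpoint every 10 iterations
    assert(!isValidInterval(-1))  // previously meant "disabled"; now an invalid value

Callers that relied on -1 instead leave the checkpoint directory unset: the updated ALS guard above only checkpoints when sc.checkpointDir is defined.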
@@ -71,23 +71,10 @@ private[regression] trait AFTSurvivalRegressionParams extends Params /** @group getParam */ @Since("1.6.0") def getQuantileProbabilities: Array[Double] = $(quantileProbabilities) - setDefault(quantileProbabilities -> Array(0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)) - /** - * Param for quantiles column name. - * This column will output quantiles of corresponding quantileProbabilities if it is set. - * @group param - */ - @Since("1.6.0") - final val quantilesCol: Param[String] = new Param(this, "quantilesCol", "quantiles column name") - - /** @group getParam */ - @Since("1.6.0") - def getQuantilesCol: String = $(quantilesCol) - - /** Checks whether the input has quantiles column name. */ - protected[regression] def hasQuantilesCol: Boolean = { - isDefined(quantilesCol) && $(quantilesCol) != "" + /** Checks whether the input has quantile probabilities array. */ + protected[regression] def hasQuantileProbabilities: Boolean = { + isDefined(quantileProbabilities) && $(quantileProbabilities).size != 0 } /** @@ -104,9 +91,6 @@ private[regression] trait AFTSurvivalRegressionParams extends Params SchemaUtils.checkColumnType(schema, $(censorCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) } - if (hasQuantilesCol) { - SchemaUtils.appendColumn(schema, $(quantilesCol), new VectorUDT) - } SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType) } } @@ -141,14 +125,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S @Since("1.6.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) - /** @group setParam */ - @Since("1.6.0") - def setQuantileProbabilities(value: Array[Double]): this.type = set(quantileProbabilities, value) - - /** @group setParam */ - @Since("1.6.0") - def setQuantilesCol(value: String): this.type = set(quantilesCol, value) - /** * Set if we should fit the intercept * Default is true. 
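With quantilesCol and the estimator-side quantile setters removed in the hunks around this point, quantiles come from the fitted model itself: set the probabilities on the model and call predictQuantiles directly, while transform() emits only the prediction column. A rough usage sketch; trainingDF and the feature value are assumed rather than taken from the patch:

    import org.apache.spark.ml.regression.AFTSurvivalRegression
    import org.apache.spark.mllib.linalg.Vectors

    val aft = new AFTSurvivalRegression()                 // no quantile configuration on the estimator
    val model = aft.fit(trainingDF)                       // trainingDF: label, censor, features columns (assumed)

    model.setQuantileProbabilities(Array(0.1, 0.5, 0.9))  // must be set before calling predictQuantiles
    val quantiles = model.predictQuantiles(Vectors.dense(6.56))

    model.transform(trainingDF).select("prediction")      // quantiles are no longer a transform output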
@@ -268,12 +244,10 @@ class AFTSurvivalRegressionModel private[ml] ( @Since("1.6.0") def setQuantileProbabilities(value: Array[Double]): this.type = set(quantileProbabilities, value) - /** @group setParam */ - @Since("1.6.0") - def setQuantilesCol(value: String): this.type = set(quantilesCol, value) - @Since("1.6.0") def predictQuantiles(features: Vector): Vector = { + require(hasQuantileProbabilities, + "AFTSurvivalRegressionModel predictQuantiles must set quantile probabilities array") // scale parameter for the Weibull distribution of lifetime val lambda = math.exp(BLAS.dot(coefficients, features) + intercept) // shape parameter for the Weibull distribution of lifetime @@ -293,13 +267,7 @@ class AFTSurvivalRegressionModel private[ml] ( override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema) val predictUDF = udf { features: Vector => predict(features) } - val predictQuantilesUDF = udf { features: Vector => predictQuantiles(features)} - if (hasQuantilesCol) { - dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) - .withColumn($(quantilesCol), predictQuantilesUDF(col($(featuresCol)))) - } else { - dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) - } + dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) } @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 477030d9ea3ee..1dd27ab455206 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -110,8 +110,7 @@ object DecisionTreeRegressor { @Experimental final class DecisionTreeRegressionModel private[ml] ( override val uid: String, - override val rootNode: Node, - override val numFeatures: Int) + override val rootNode: Node) extends PredictionModel[Vector, DecisionTreeRegressionModel] with DecisionTreeModel with Serializable { @@ -122,8 +121,7 @@ final class DecisionTreeRegressionModel private[ml] ( * Construct a decision tree regression model. * @param rootNode Root node of tree, with other nodes attached. */ - private[ml] def this(rootNode: Node, numFeatures: Int) = - this(Identifiable.randomUID("dtr"), rootNode, numFeatures) + private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) override protected def predict(features: Vector): Double = { rootNode.predictImpl(features).prediction @@ -131,7 +129,7 @@ final class DecisionTreeRegressionModel private[ml] ( @Since("1.4.0") override def copy(extra: ParamMap): DecisionTreeRegressionModel = { - copyValues(new DecisionTreeRegressionModel(uid, rootNode, numFeatures), extra).setParent(parent) + copyValues(new DecisionTreeRegressionModel(uid, rootNode), extra).setParent(parent) } @Since("1.4.0") @@ -151,13 +149,12 @@ private[ml] object DecisionTreeRegressionModel { def fromOld( oldModel: OldDecisionTreeModel, parent: DecisionTreeRegressor, - categoricalFeatures: Map[Int, Int], - numFeatures: Int = -1): DecisionTreeRegressionModel = { + categoricalFeatures: Map[Int, Int]): DecisionTreeRegressionModel = { require(oldModel.algo == OldAlgo.Regression, s"Cannot convert non-regression DecisionTreeModel (old API) to" + s" DecisionTreeRegressionModel (new API). 
Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtr") - new DecisionTreeRegressionModel(uid, rootNode, numFeatures) + new DecisionTreeRegressionModel(uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 07144cc7cfbd7..61f68c7bbc9ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -142,11 +142,10 @@ final class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: Stri val categoricalFeatures: Map[Int, Int] = MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol))) val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset) - val numFeatures = oldDataset.first().features.size val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Regression) val oldGBT = new OldGBT(boostingStrategy) val oldModel = oldGBT.run(oldDataset) - GBTRegressionModel.fromOld(oldModel, this, categoricalFeatures, numFeatures) + GBTRegressionModel.fromOld(oldModel, this, categoricalFeatures) } @Since("1.4.0") @@ -173,11 +172,10 @@ object GBTRegressor { */ @Since("1.4.0") @Experimental -final class GBTRegressionModel private[ml]( +final class GBTRegressionModel( override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], - private val _treeWeights: Array[Double], - override val numFeatures: Int) + private val _treeWeights: Array[Double]) extends PredictionModel[Vector, GBTRegressionModel] with TreeEnsembleModel with Serializable { @@ -217,8 +215,7 @@ final class GBTRegressionModel private[ml]( @Since("1.4.0") override def copy(extra: ParamMap): GBTRegressionModel = { - copyValues(new GBTRegressionModel(uid, _trees, _treeWeights, numFeatures), - extra).setParent(parent) + copyValues(new GBTRegressionModel(uid, _trees, _treeWeights), extra).setParent(parent) } @Since("1.4.0") @@ -238,8 +235,7 @@ private[ml] object GBTRegressionModel { def fromOld( oldModel: OldGBTModel, parent: GBTRegressor, - categoricalFeatures: Map[Int, Int], - numFeatures: Int = -1): GBTRegressionModel = { + categoricalFeatures: Map[Int, Int]): GBTRegressionModel = { require(oldModel.algo == OldAlgo.Regression, "Cannot convert GradientBoostedTreesModel" + s" with algo=${oldModel.algo} (old API) to GBTRegressionModel (new API).") val newTrees = oldModel.trees.map { tree => @@ -247,6 +243,6 @@ private[ml] object GBTRegressionModel { DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtr") - new GBTRegressionModel(parent.uid, newTrees, oldModel.treeWeights, numFeatures) + new GBTRegressionModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 71e40b513ee0a..5a05f80b312a1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -131,7 +131,7 @@ object RandomForestRegressor { final class RandomForestRegressionModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], - override val numFeatures: 
Int) + val numFeatures: Int) extends PredictionModel[Vector, RandomForestRegressionModel] with TreeEnsembleModel with Serializable { @@ -207,14 +207,13 @@ private[ml] object RandomForestRegressionModel { def fromOld( oldModel: OldRandomForestModel, parent: RandomForestRegressor, - categoricalFeatures: Map[Int, Int], - numFeatures: Int = -1): RandomForestRegressionModel = { + categoricalFeatures: Map[Int, Int]): RandomForestRegressionModel = { require(oldModel.algo == OldAlgo.Regression, "Cannot convert RandomForestModel" + s" with algo=${oldModel.algo} (old API) to RandomForestRegressionModel (new API).") val newTrees = oldModel.trees.map { tree => // parent for each tree is null since there is no good way to set this. DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } - new RandomForestRegressionModel(parent.uid, newTrees, numFeatures) + new RandomForestRegressionModel(parent.uid, newTrees, -1) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala index 1ee01131d6334..c5ad8df73fac9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala @@ -122,7 +122,7 @@ private[spark] class NodeIdCache( rddUpdateCount += 1 // Handle checkpointing if the directory is not None. - if (canCheckpoint && checkpointInterval != -1 && (rddUpdateCount % checkpointInterval) == 0) { + if (canCheckpoint && (rddUpdateCount % checkpointInterval) == 0) { // Let's see if we can delete previous checkpoints. var canDelete = true while (checkpointQueue.size > 1 && canDelete) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index 4a3b12d1440b8..72890f56449af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -179,28 +179,22 @@ private[ml] object RandomForest extends Logging { } } - val numFeatures = metadata.numFeatures - parentUID match { case Some(uid) => if (strategy.algo == OldAlgo.Classification) { topNodes.map { rootNode => - new DecisionTreeClassificationModel(uid, rootNode.toNode, numFeatures, - strategy.getNumClasses) + new DecisionTreeClassificationModel(uid, rootNode.toNode, strategy.getNumClasses) } } else { - topNodes.map { rootNode => - new DecisionTreeRegressionModel(uid, rootNode.toNode, numFeatures) - } + topNodes.map(rootNode => new DecisionTreeRegressionModel(uid, rootNode.toNode)) } case None => if (strategy.algo == OldAlgo.Classification) { topNodes.map { rootNode => - new DecisionTreeClassificationModel(rootNode.toNode, numFeatures, - strategy.getNumClasses) + new DecisionTreeClassificationModel(rootNode.toNode, strategy.getNumClasses) } } else { - topNodes.map(rootNode => new DecisionTreeRegressionModel(rootNode.toNode, numFeatures)) + topNodes.map(rootNode => new DecisionTreeRegressionModel(rootNode.toNode)) } } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 1da97db9277d8..b755522b1fd97 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.tree +import org.apache.spark.ml.classification.ClassifierParams import 
org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ -87,8 +88,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams /** * If false, the algorithm will pass trees to executors to match instances with nodes. * If true, the algorithm will cache node IDs for each instance. - * Caching can speed up training of deeper trees. Users can set how often should the - * cache be checkpointed or disable it by setting checkpointInterval. + * Caching can speed up training of deeper trees. * (default = false) * @group expertParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 77d9948ed86b9..0679bfd0f3ffe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -136,10 +136,6 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM /** * :: Experimental :: * Model from k-fold cross validation. - * - * @param bestModel The best model selected from k-fold cross validation. - * @param avgMetrics Average cross-validation metrics for each paramMap in - * [[estimatorParamMaps]], in the corresponding order. */ @Experimental class CrossValidatorModel private[ml] ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index d4d022afde051..f82ed3fe9ea68 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -174,8 +174,6 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * Creates a ChiSquared feature selector. * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) - * Note that if the number of features is < numTopFeatures, then this will - * select all features. 
*/ @Since("1.3.0") class ChiSqSelector @Since("1.3.0") ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 07eb750b06a3b..95c688c86a7e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -142,10 +142,5 @@ object AssociationRules { def javaConsequent: java.util.List[Item] = { consequent.toList.asJava } - - override def toString: String = { - s"${antecedent.mkString("{", ",", "}")} => " + - s"${consequent.mkString("{", ",", "}")}: ${confidence}" - } } } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index f5f690eabd12c..075a62c493f17 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -19,7 +19,6 @@ import java.io.Serializable; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Before; @@ -76,20 +75,21 @@ public void naiveBayesDefaultParams() { @Test public void testNaiveBayes() { - List data = Arrays.asList( + JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0.0, Vectors.dense(1.0, 0.0, 0.0)), RowFactory.create(0.0, Vectors.dense(2.0, 0.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 2.0, 0.0)), RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 1.0)), - RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0))); + RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0)) + )); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame(data, schema); + DataFrame dataset = jsql.createDataFrame(jrdd, schema); NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial"); NaiveBayesModel model = nb.fit(dataset); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java index 8a1e5ef015659..47d68de599da2 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java @@ -55,16 +55,16 @@ public void tearDown() { public void bucketizerTest() { double[] splits = {-0.5, 0.0, 0.5}; + JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(-0.5), + RowFactory.create(-0.3), + RowFactory.create(0.0), + RowFactory.create(0.2) + )); StructType schema = new StructType(new StructField[] { new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame( - Arrays.asList( - RowFactory.create(-0.5), - RowFactory.create(-0.3), - RowFactory.create(0.0), - RowFactory.create(0.2)), - schema); + DataFrame dataset = jsql.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("feature") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java index 39da47381b129..0f6ec64d97d36 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java +++ 
b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java @@ -57,11 +57,12 @@ public void tearDown() { @Test public void javaCompatibilityTest() { double[] input = new double[] {1D, 2D, 3D, 4D}; - DataFrame dataset = jsql.createDataFrame( - Arrays.asList(RowFactory.create(Vectors.dense(input))), - new StructType(new StructField[]{ - new StructField("vec", (new VectorUDT()), false, Metadata.empty()) - })); + JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(input)) + )); + DataFrame dataset = jsql.createDataFrame(data, new StructType(new StructField[]{ + new StructField("vec", (new VectorUDT()), false, Metadata.empty()) + })); double[] expectedResult = input.clone(); (new DoubleDCT_1D(input.length)).forward(expectedResult, true); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java index d12332c2a02a3..03dd5369bddf7 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.ml.feature; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Assert; @@ -56,17 +55,17 @@ public void tearDown() { @Test public void hashingTF() { - List data = Arrays.asList( + JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0.0, "Hi I heard about Spark"), RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") - ); + )); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); - DataFrame sentenceData = jsql.createDataFrame(data, schema); + DataFrame sentenceData = jsql.createDataFrame(jrdd, schema); Tokenizer tokenizer = new Tokenizer() .setInputCol("sentence") .setOutputCol("words"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java index bf8eefd71905c..834fedbb59e1b 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.ml.feature; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Assert; @@ -61,7 +60,7 @@ public void polynomialExpansionTest() { .setOutputCol("polyFeatures") .setDegree(3); - List data = Arrays.asList( + JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create( Vectors.dense(-2.0, 2.3), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17) @@ -71,7 +70,7 @@ public void polynomialExpansionTest() { Vectors.dense(0.6, -1.1), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331) ) - ); + )); StructType schema = new StructType(new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java index 848d9f8aa9288..76cdd0fae84ab 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java +++ 
b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.ml.feature; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Before; @@ -59,14 +58,14 @@ public void javaCompatibilityTest() { .setInputCol("raw") .setOutputCol("filtered"); - List data = Arrays.asList( + JavaRDD rdd = jsc.parallelize(Arrays.asList( RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) - ); + )); StructType schema = new StructType(new StructField[] { new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame(data, schema); + DataFrame dataset = jsql.createDataFrame(rdd, schema); remover.transform(dataset).collect(); } diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java index 6b2c48ef1c342..35b18c5308f61 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStringIndexerSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.ml.feature; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Assert; @@ -57,9 +56,9 @@ public void testStringIndexer() { createStructField("id", IntegerType, false), createStructField("label", StringType, false) }); - List data = Arrays.asList( - c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c")); - DataFrame dataset = sqlContext.createDataFrame(data, schema); + JavaRDD rdd = jsc.parallelize( + Arrays.asList(c(0, "a"), c(1, "b"), c(2, "c"), c(3, "a"), c(4, "a"), c(5, "c"))); + DataFrame dataset = sqlContext.createDataFrame(rdd, schema); StringIndexer indexer = new StringIndexer() .setInputCol("label") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java index e283777570930..b7c564caad3bd 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorAssemblerSuite.java @@ -65,7 +65,8 @@ public void testVectorAssembler() { Row row = RowFactory.create( 0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, new int[] {1}, new double[] {3.0}), 10L); - DataFrame dataset = sqlContext.createDataFrame(Arrays.asList(row), schema); + JavaRDD rdd = jsc.parallelize(Arrays.asList(row)); + DataFrame dataset = sqlContext.createDataFrame(rdd, schema); VectorAssembler assembler = new VectorAssembler() .setInputCols(new String[] {"x", "y", "z", "n"}) .setOutputCol("features"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java index 00174e6a683d6..f953361427586 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.ml.feature; import java.util.Arrays; -import java.util.List; import org.junit.After; import org.junit.Assert; @@ -64,12 +63,12 @@ public void vectorSlice() { }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); - List data = Arrays.asList( + JavaRDD jrdd = 
jsc.parallelize(Arrays.asList( RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) - ); + )); - DataFrame dataset = jsql.createDataFrame(data, (new StructType()).add(group.toStructField())); + DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer() .setInputCol("userFeatures").setOutputCol("features"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java index 0c0c1c4d12d0f..70f5ad9432212 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java @@ -51,15 +51,15 @@ public void tearDown() { @Test public void testJavaWord2Vec() { + JavaRDD jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), + RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), + RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) + )); StructType schema = new StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - DataFrame documentDF = sqlContext.createDataFrame( - Arrays.asList( - RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), - RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), - RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))), - schema); + DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema); Word2Vec word2Vec = new Word2Vec() .setInputCol("text") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 92b8f84144ab0..ae1ccefd60f53 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -59,7 +59,7 @@ class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkConte test("params") { ParamsSuite.checkParams(new DecisionTreeClassifier) - val model = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 1, 2) + val model = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2) ParamsSuite.checkParams(model) } @@ -311,7 +311,6 @@ private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite { dt: DecisionTreeClassifier, categoricalFeatures: Map[Int, Int], numClasses: Int): Unit = { - val numFeatures = data.first().features.size val oldStrategy = dt.getOldStrategy(categoricalFeatures, numClasses) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses) @@ -320,6 +319,5 @@ private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite { val oldTreeAsNew = DecisionTreeClassificationModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeClassifier], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) - assert(newTree.numFeatures === numFeatures) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 039141aeb6f67..e3909bccaa5ca 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -59,8 +59,8 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new GBTClassifier) val model = new GBTClassificationModel("gbtc", - Array(new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0, null), 1)), - Array(1.0), 1) + Array(new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0, null))), + Array(1.0)) ParamsSuite.checkParams(model) } @@ -145,7 +145,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { */ } -private object GBTClassifierSuite extends SparkFunSuite { +private object GBTClassifierSuite { /** * Train 2 models on the given dataset, one using the old API and one using the new API. @@ -156,7 +156,6 @@ private object GBTClassifierSuite extends SparkFunSuite { validationData: Option[RDD[LabeledPoint]], gbt: GBTClassifier, categoricalFeatures: Map[Int, Int]): Unit = { - val numFeatures = data.first().features.size val oldBoostingStrategy = gbt.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification) val oldGBT = new OldGBT(oldBoostingStrategy) @@ -165,9 +164,7 @@ private object GBTClassifierSuite extends SparkFunSuite { val newModel = gbt.fit(newData) // Use parent from newTree since this is not checked anyways. val oldModelAsNew = GBTClassificationModel.fromOld( - oldModel, newModel.parent.asInstanceOf[GBTClassifier], categoricalFeatures, numFeatures) + oldModel, newModel.parent.asInstanceOf[GBTClassifier], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) - assert(newModel.numFeatures === numFeatures) - assert(oldModelAsNew.numFeatures === numFeatures) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 325faf37e8eea..e4bf363d96076 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -197,8 +197,6 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val model = lr.fit(dataset) assert(model.numClasses === 2) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size - assert(model.numFeatures === numFeatures) val threshold = model.getThreshold val results = model.transform(dataset) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index a326432d017fc..af0e2e97545b7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors import 
org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row @@ -74,8 +74,6 @@ class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSp .setSeed(11L) // currently this seed is ignored .setMaxIter(numIterations) val model = trainer.fit(dataFrame) - val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size - assert(model.numFeatures === numFeatures) val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label") .map { case Row(p: Double, l: Double) => (p, l) } // train multinomial logistic regression diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index fb5f00e0646c6..8f50cb924e64d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -22,7 +22,6 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, - override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { @@ -46,14 +45,13 @@ class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val thresholds = Array(0.5, 0.2) - val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) - .setThresholds(thresholds) + val testModel = new TestProbabilisticClassificationModel("myuid", 2).setThresholds(thresholds) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0) } test("test thresholding not required") { - val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) + val testModel = new TestProbabilisticClassificationModel("myuid", 2) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index deb8ec771cb27..b4403ec30049a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -68,7 +68,7 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte test("params") { ParamsSuite.checkParams(new RandomForestClassifier) val model = new RandomForestClassificationModel("rfc", - Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 1, 2)), 2, 2) + Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)), 2, 2) ParamsSuite.checkParams(model) } @@ -209,7 +209,7 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte */ } -private object RandomForestClassifierSuite extends SparkFunSuite { +private object RandomForestClassifierSuite { /** * Train 2 models on the given dataset, one using the old API and one using the new API. 
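Once numFeatures is dropped from the tree models, the constructors exercised by the "params" tests above take only a uid and root node, plus numClasses on the classification side. A condensed sketch mirroring those tests; the constructors are private[ml], so as in the suites this only compiles inside Spark's ml packages:

    import org.apache.spark.ml.classification.DecisionTreeClassificationModel
    import org.apache.spark.ml.regression.DecisionTreeRegressionModel
    import org.apache.spark.ml.tree.LeafNode

    // Classification keeps numClasses as the trailing argument; regression takes just uid and rootNode.
    val dtc = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)
    val dtr = new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0, null))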
@@ -220,7 +220,6 @@ private object RandomForestClassifierSuite extends SparkFunSuite { rf: RandomForestClassifier, categoricalFeatures: Map[Int, Int], numClasses: Int): Unit = { - val numFeatures = data.first().features.size val oldStrategy = rf.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, rf.getOldImpurity) val oldModel = OldRandomForest.trainClassifier( @@ -234,7 +233,6 @@ private object RandomForestClassifierSuite extends SparkFunSuite { TreeTests.checkEqual(oldModelAsNew, newModel) assert(newModel.hasParent) assert(!newModel.trees.head.asInstanceOf[DecisionTreeClassificationModel].hasParent) - assert(newModel.numClasses === numClasses) - assert(newModel.numFeatures === numFeatures) + assert(newModel.numClasses == numClasses) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala deleted file mode 100644 index e5a42967bd2c8..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.feature - -import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ -import org.apache.spark.sql.{Row, SQLContext} - -class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { - test("Test Chi-Square selector") { - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val data = Seq( - LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) - ) - - val preFilteredData = Seq( - Vectors.dense(0.0), - Vectors.dense(6.0), - Vectors.dense(8.0), - Vectors.dense(5.0) - ) - - val df = sc.parallelize(data.zip(preFilteredData)) - .map(x => (x._1.label, x._1.features, x._2)) - .toDF("label", "data", "preFilteredData") - - val model = new ChiSqSelector() - .setNumTopFeatures(1) - .setFeaturesCol("data") - .setLabelCol("label") - .setOutputCol("filtered") - - model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { - case Row(vec1: Vector, vec2: Vector) => - assert(vec1 ~== vec2 absTol 1e-1) - } - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala deleted file mode 100644 index b2bdd8935f903..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.feature - -import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.{SparkContext, SparkFunSuite} - -class QuantileDiscretizerSuite extends SparkFunSuite with MLlibTestSparkContext { - import org.apache.spark.ml.feature.QuantileDiscretizerSuite._ - - test("Test quantile discretizer") { - checkDiscretizedData(sc, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - 10, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity")) - - checkDiscretizedData(sc, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - 4, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity")) - - checkDiscretizedData(sc, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - 3, - Array[Double](0, 1, 2, 2, 2, 2, 2, 2, 2), - Array("-Infinity, 2.0", "2.0, 3.0", "3.0, Infinity")) - - checkDiscretizedData(sc, - Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), - 2, - Array[Double](0, 1, 1, 1, 1, 1, 1, 1, 1), - Array("-Infinity, 2.0", "2.0, Infinity")) - - } - - test("Test getting splits") { - val splitTestPoints = Array( - Array[Double]() -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), - Array(Double.NegativeInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), - Array(Double.PositiveInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), - Array(Double.NegativeInfinity, Double.PositiveInfinity) - -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), - Array(0.0) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), - Array(1.0) -> Array(Double.NegativeInfinity, 1, Double.PositiveInfinity), - Array(0.0, 1.0) -> Array(Double.NegativeInfinity, 0, 1, Double.PositiveInfinity) - ) - for ((ori, res) <- splitTestPoints) { - assert(QuantileDiscretizer.getSplits(ori) === res, "Returned splits are invalid.") - } - } -} - -private object QuantileDiscretizerSuite extends SparkFunSuite { - - def checkDiscretizedData( - sc: SparkContext, - data: Array[Double], - numBucket: Int, - expectedResult: Array[Double], - expectedAttrs: Array[String]): Unit = { - val sqlCtx = SQLContext.getOrCreate(sc) - import sqlCtx.implicits._ - - val df = sc.parallelize(data.map(Tuple1.apply)).toDF("input") - val discretizer = new QuantileDiscretizer().setInputCol("input").setOutputCol("result") - .setNumBuckets(numBucket) - val result = discretizer.fit(df).transform(df) - - val transformedFeatures = result.select("result").collect() - .map { case Row(transformedFeature: Double) => transformedFeature } - val transformedAttrs = Attribute.fromStructField(result.schema("result")) - .asInstanceOf[NominalAttribute].values.get - - assert(transformedFeatures === expectedResult, - "Transformed features do not equal expected features.") - assert(transformedAttrs === expectedAttrs, - "Transformed attributes do not equal expected attributes.") - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala index 53798c659d4f3..436e66bab09b0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala @@ -25,24 +25,16 @@ class RFormulaParserSuite extends SparkFunSuite { formula: String, label: String, terms: Seq[String], - schema: StructType 
= new StructType) { + schema: StructType = null) { val resolved = RFormulaParser.parse(formula).resolve(schema) assert(resolved.label == label) - val simpleTerms = terms.map { t => - if (t.contains(":")) { - t.split(":").toSeq - } else { - Seq(t) - } - } - assert(resolved.terms == simpleTerms) + assert(resolved.terms == terms) } test("parse simple formulas") { checkParse("y ~ x", "y", Seq("x")) checkParse("y ~ x + x", "y", Seq("x")) - checkParse("y~x+z", "y", Seq("x", "z")) - checkParse("y ~ ._fo..o ", "y", Seq("._fo..o")) + checkParse("y ~ ._foo ", "y", Seq("._foo")) checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123")) } @@ -87,79 +79,4 @@ class RFormulaParserSuite extends SparkFunSuite { assert(!RFormulaParser.parse("a ~ b - 1").hasIntercept) assert(!RFormulaParser.parse("a ~ b + 1 - 1").hasIntercept) } - - test("parse interactions") { - checkParse("y ~ a:b", "y", Seq("a:b")) - checkParse("y ~ ._a:._x", "y", Seq("._a:._x")) - checkParse("y ~ foo:bar", "y", Seq("foo:bar")) - checkParse("y ~ a : b : c", "y", Seq("a:b:c")) - checkParse("y ~ q + a:b:c + b:c + c:d + z", "y", Seq("q", "a:b:c", "b:c", "c:d", "z")) - } - - test("parse basic interactions with dot") { - val schema = (new StructType) - .add("a", "int", true) - .add("b", "long", false) - .add("c", "string", true) - .add("d", "string", true) - checkParse("a ~ .:b", "a", Seq("b", "c:b", "d:b"), schema) - checkParse("a ~ b:.", "a", Seq("b", "b:c", "b:d"), schema) - checkParse("a ~ .:b:.:.:c:d:.", "a", Seq("b:c:d"), schema) - } - - // Test data generated in R with terms.formula(y ~ .:., data = iris) - test("parse all to all iris interactions") { - val schema = (new StructType) - .add("Sepal.Length", "double", true) - .add("Sepal.Width", "double", true) - .add("Petal.Length", "double", true) - .add("Petal.Width", "double", true) - .add("Species", "string", true) - checkParse( - "y ~ .:.", - "y", - Seq( - "Sepal.Length", - "Sepal.Width", - "Petal.Length", - "Petal.Width", - "Species", - "Sepal.Length:Sepal.Width", - "Sepal.Length:Petal.Length", - "Sepal.Length:Petal.Width", - "Sepal.Length:Species", - "Sepal.Width:Petal.Length", - "Sepal.Width:Petal.Width", - "Sepal.Width:Species", - "Petal.Length:Petal.Width", - "Petal.Length:Species", - "Petal.Width:Species"), - schema) - } - - // Test data generated in R with terms.formula(y ~ .:. - Species:., data = iris) - test("parse interaction negation with iris") { - val schema = (new StructType) - .add("Sepal.Length", "double", true) - .add("Sepal.Width", "double", true) - .add("Petal.Length", "double", true) - .add("Petal.Width", "double", true) - .add("Species", "string", true) - checkParse("y ~ .:. - .:.", "y", Nil, schema) - checkParse( - "y ~ .:. 
- Species:.", - "y", - Seq( - "Sepal.Length", - "Sepal.Width", - "Petal.Length", - "Petal.Width", - "Sepal.Length:Sepal.Width", - "Sepal.Length:Petal.Length", - "Sepal.Length:Petal.Width", - "Sepal.Width:Petal.Length", - "Sepal.Width:Petal.Width", - "Petal.Length:Petal.Width"), - schema) - } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index dc20a5ec2152d..c101e07fc3505 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -137,81 +137,9 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext { val expectedAttrs = new AttributeGroup( "features", Array( - new BinaryAttribute(Some("a_bar"), Some(1)), - new BinaryAttribute(Some("a_foo"), Some(2)), + new BinaryAttribute(Some("a__bar"), Some(1)), + new BinaryAttribute(Some("a__foo"), Some(2)), new NumericAttribute(Some("b"), Some(3)))) assert(attrs === expectedAttrs) } - - test("numeric interaction") { - val formula = new RFormula().setFormula("a ~ b:c:d") - val original = sqlContext.createDataFrame( - Seq((1, 2, 4, 2), (2, 3, 4, 1)) - ).toDF("a", "b", "c", "d") - val model = formula.fit(original) - val result = model.transform(original) - val expected = sqlContext.createDataFrame( - Seq( - (1, 2, 4, 2, Vectors.dense(16.0), 1.0), - (2, 3, 4, 1, Vectors.dense(12.0), 2.0)) - ).toDF("a", "b", "c", "d", "features", "label") - assert(result.collect() === expected.collect()) - val attrs = AttributeGroup.fromStructField(result.schema("features")) - val expectedAttrs = new AttributeGroup( - "features", - Array[Attribute](new NumericAttribute(Some("b:c:d"), Some(1)))) - assert(attrs === expectedAttrs) - } - - test("factor numeric interaction") { - val formula = new RFormula().setFormula("id ~ a:b") - val original = sqlContext.createDataFrame( - Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5), (4, "baz", 5), (4, "baz", 5)) - ).toDF("id", "a", "b") - val model = formula.fit(original) - val result = model.transform(original) - val expected = sqlContext.createDataFrame( - Seq( - (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), - (2, "bar", 4, Vectors.dense(0.0, 4.0, 0.0), 2.0), - (3, "bar", 5, Vectors.dense(0.0, 5.0, 0.0), 3.0), - (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0), - (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0), - (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0)) - ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) - val attrs = AttributeGroup.fromStructField(result.schema("features")) - val expectedAttrs = new AttributeGroup( - "features", - Array[Attribute]( - new NumericAttribute(Some("a_baz:b"), Some(1)), - new NumericAttribute(Some("a_bar:b"), Some(2)), - new NumericAttribute(Some("a_foo:b"), Some(3)))) - assert(attrs === expectedAttrs) - } - - test("factor factor interaction") { - val formula = new RFormula().setFormula("id ~ a:b") - val original = sqlContext.createDataFrame( - Seq((1, "foo", "zq"), (2, "bar", "zq"), (3, "bar", "zz")) - ).toDF("id", "a", "b") - val model = formula.fit(original) - val result = model.transform(original) - val expected = sqlContext.createDataFrame( - Seq( - (1, "foo", "zq", Vectors.dense(0.0, 0.0, 1.0, 0.0), 1.0), - (2, "bar", "zq", Vectors.dense(1.0, 0.0, 0.0, 0.0), 2.0), - (3, "bar", "zz", Vectors.dense(0.0, 1.0, 0.0, 0.0), 3.0)) - ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === 
expected.collect()) - val attrs = AttributeGroup.fromStructField(result.schema("features")) - val expectedAttrs = new AttributeGroup( - "features", - Array[Attribute]( - new NumericAttribute(Some("a_bar:b_zq"), Some(1)), - new NumericAttribute(Some("a_bar:b_zz"), Some(2)), - new NumericAttribute(Some("a_foo:b_zq"), Some(3)), - new NumericAttribute(Some("a_foo:b_zz"), Some(4)))) - assert(attrs === expectedAttrs) - } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala index 359f31027172b..ca7140a45ea65 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala @@ -22,7 +22,8 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.MLTestingUtils -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.{DenseVector, Vectors} +import org.apache.spark.mllib.linalg.BLAS import org.apache.spark.mllib.random.{ExponentialGenerator, WeibullGenerator} import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -58,20 +59,16 @@ class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContex assert(aftr.getFitIntercept) assert(aftr.getMaxIter === 100) assert(aftr.getTol === 1E-6) - val model = aftr.setQuantileProbabilities(Array(0.1, 0.8)) - .setQuantilesCol("quantiles") - .fit(datasetUnivariate) + val model = aftr.fit(datasetUnivariate) // copied model must have the same parent. MLTestingUtils.checkCopy(model) model.transform(datasetUnivariate) - .select("label", "prediction", "quantiles") + .select("label", "prediction") .collect() assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") - assert(model.getQuantileProbabilities === Array(0.1, 0.8)) - assert(model.getQuantilesCol === "quantiles") assert(model.intercept !== 0.0) assert(model.hasParent) } @@ -111,10 +108,7 @@ class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContex } test("aft survival regression with univariate") { - val quantileProbabilities = Array(0.1, 0.5, 0.9) - val trainer = new AFTSurvivalRegression() - .setQuantileProbabilities(quantileProbabilities) - .setQuantilesCol("quantiles") + val trainer = new AFTSurvivalRegression val model = trainer.fit(datasetUnivariate) /* @@ -165,25 +159,23 @@ class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContex [1] 0.1879174 2.6801195 14.5779394 */ val features = Vectors.dense(6.559282795753792) + val quantileProbabilities = Array(0.1, 0.5, 0.9) val responsePredictR = 4.494763 val quantilePredictR = Vectors.dense(0.1879174, 2.6801195, 14.5779394) assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) - model.transform(datasetUnivariate).select("features", "prediction", "quantiles") - .collect().foreach { - case Row(features: Vector, prediction: Double, quantiles: Vector) => - assert(prediction ~== model.predict(features) relTol 1E-5) - assert(quantiles ~== model.predictQuantiles(features) relTol 1E-5) + model.transform(datasetUnivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, 
prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) + model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) } } test("aft survival regression with multivariate") { - val quantileProbabilities = Array(0.1, 0.5, 0.9) - val trainer = new AFTSurvivalRegression() - .setQuantileProbabilities(quantileProbabilities) - .setQuantilesCol("quantiles") + val trainer = new AFTSurvivalRegression val model = trainer.fit(datasetMultivariate) /* @@ -235,26 +227,23 @@ class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContex [1] 0.5287044 3.3285858 10.7517072 */ val features = Vectors.dense(2.233396950271428, -2.5321374085997683) + val quantileProbabilities = Array(0.1, 0.5, 0.9) val responsePredictR = 4.761219 val quantilePredictR = Vectors.dense(0.5287044, 3.3285858, 10.7517072) assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) - model.transform(datasetMultivariate).select("features", "prediction", "quantiles") - .collect().foreach { - case Row(features: Vector, prediction: Double, quantiles: Vector) => - assert(prediction ~== model.predict(features) relTol 1E-5) - assert(quantiles ~== model.predictQuantiles(features) relTol 1E-5) + model.transform(datasetMultivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) + model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) } } test("aft survival regression w/o intercept") { - val quantileProbabilities = Array(0.1, 0.5, 0.9) - val trainer = new AFTSurvivalRegression() - .setQuantileProbabilities(quantileProbabilities) - .setQuantilesCol("quantiles") - .setFitIntercept(false) + val trainer = new AFTSurvivalRegression().setFitIntercept(false) val model = trainer.fit(datasetMultivariate) /* @@ -305,31 +294,18 @@ class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContex [1] 1.452103 25.506077 158.428600 */ val features = Vectors.dense(2.233396950271428, -2.5321374085997683) + val quantileProbabilities = Array(0.1, 0.5, 0.9) val responsePredictR = 44.54465 val quantilePredictR = Vectors.dense(1.452103, 25.506077, 158.428600) assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) - model.transform(datasetMultivariate).select("features", "prediction", "quantiles") - .collect().foreach { - case Row(features: Vector, prediction: Double, quantiles: Vector) => - assert(prediction ~== model.predict(features) relTol 1E-5) - assert(quantiles ~== model.predictQuantiles(features) relTol 1E-5) - } - } - - test("aft survival regression w/o quantiles column") { - val trainer = new AFTSurvivalRegression - val model = trainer.fit(datasetUnivariate) - val outputDf = model.transform(datasetUnivariate) - - assert(outputDf.schema.fieldNames.contains("quantiles") === false) - - outputDf.select("features", "prediction") - .collect().foreach { - case Row(features: Vector, prediction: Double) => - assert(prediction ~== model.predict(features) relTol 1E-5) + model.transform(datasetMultivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) 
+ model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index e0d5afa7a7e97..fa40054b361c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -90,7 +90,6 @@ private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { data: RDD[LabeledPoint], dt: DecisionTreeRegressor, categoricalFeatures: Map[Int, Int]): Unit = { - val numFeatures = data.first().features.size val oldStrategy = dt.getOldStrategy(categoricalFeatures) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) @@ -99,6 +98,5 @@ private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) - assert(newTree.numFeatures === numFeatures) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index 09326600e620f..a68197b59193d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -156,7 +156,7 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { */ } -private object GBTRegressorSuite extends SparkFunSuite { +private object GBTRegressorSuite { /** * Train 2 models on the given dataset, one using the old API and one using the new API. @@ -167,7 +167,6 @@ private object GBTRegressorSuite extends SparkFunSuite { validationData: Option[RDD[LabeledPoint]], gbt: GBTRegressor, categoricalFeatures: Map[Int, Int]): Unit = { - val numFeatures = data.first().features.size val oldBoostingStrategy = gbt.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Regression) val oldGBT = new OldGBT(oldBoostingStrategy) val oldModel = oldGBT.run(data) @@ -175,9 +174,7 @@ private object GBTRegressorSuite extends SparkFunSuite { val newModel = gbt.fit(newData) // Use parent from newTree since this is not checked anyways. 
val oldModelAsNew = GBTRegressionModel.fromOld( - oldModel, newModel.parent.asInstanceOf[GBTRegressor], categoricalFeatures, numFeatures) + oldModel, newModel.parent.asInstanceOf[GBTRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) - assert(newModel.numFeatures === numFeatures) - assert(oldModelAsNew.numFeatures === numFeatures) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a1d86fe8fedad..d12b5dd275d59 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -23,8 +23,8 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.MLTestingUtils +import org.apache.spark.mllib.linalg.{DenseVector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.{Vector, DenseVector, Vectors} import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala index 7e751e4b553b6..7b1b3f11481de 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala @@ -137,7 +137,6 @@ private object RandomForestRegressorSuite extends SparkFunSuite { data: RDD[LabeledPoint], rf: RandomForestRegressor, categoricalFeatures: Map[Int, Int]): Unit = { - val numFeatures = data.first().features.size val oldStrategy = rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity) val oldModel = OldRandomForest.trainRegressor( @@ -148,6 +147,5 @@ private object RandomForestRegressorSuite extends SparkFunSuite { val oldModelAsNew = RandomForestRegressionModel.fromOld( oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) - assert(newModel.numFeatures === numFeatures) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index d5c238e9ae164..dc852795c7f62 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -77,8 +77,7 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { // Forest consisting of (full tree) + (internal node with 2 leafs) val trees = Array(parent, grandParent).map { root => - new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3) - .asInstanceOf[DecisionTreeModel] + new DecisionTreeClassificationModel(root, numClasses = 3).asInstanceOf[DecisionTreeModel] } val importances: Vector = RandomForest.featureImportances(trees, 2) val tree2norm = feature0importance + feature1importance diff --git a/network/common/src/main/java/org/apache/spark/network/client/TransportClient.java b/network/common/src/main/java/org/apache/spark/network/client/TransportClient.java index a0ba223e340a2..09b79bb6e9130 
100644 --- a/network/common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/network/common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -79,10 +79,6 @@ public TransportClient(Channel channel, TransportResponseHandler handler) { this.handler = Preconditions.checkNotNull(handler); } - public Channel getChannel() { - return channel; - } - public boolean isActive() { return channel.isOpen() || channel.isActive(); } diff --git a/network/common/src/main/java/org/apache/spark/network/server/RpcHandler.java b/network/common/src/main/java/org/apache/spark/network/server/RpcHandler.java index dbb7f95f55bc0..2ba92a40f8b0a 100644 --- a/network/common/src/main/java/org/apache/spark/network/server/RpcHandler.java +++ b/network/common/src/main/java/org/apache/spark/network/server/RpcHandler.java @@ -52,6 +52,4 @@ public abstract void receive( * No further requests will come from this client. */ public void connectionTerminated(TransportClient client) { } - - public void exceptionCaught(Throwable cause, TransportClient client) { } } diff --git a/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index 4f67bd573be21..bb734b8bbf12f 100644 --- a/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -74,7 +74,6 @@ public TransportRequestHandler( @Override public void exceptionCaught(Throwable cause) { - rpcHandler.exceptionCaught(cause, reverseClient); } @Override diff --git a/network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java b/network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java index 922c37a10efdd..57113ed12d414 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java +++ b/network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java @@ -15,24 +15,6 @@ * limitations under the License. */ -/* - * Based on LimitedInputStream.java from Google Guava - * - * Copyright (C) 2007 The Guava Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark.network.util; import java.io.FilterInputStream; diff --git a/python/docs/_static/pyspark.css b/python/docs/_static/pyspark.css deleted file mode 100644 index 41106f2f6e26d..0000000000000 --- a/python/docs/_static/pyspark.css +++ /dev/null @@ -1,90 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -body { - background-color: #ffffff; -} - -div.sphinxsidebar { - width: 274px; -} - -div.bodywrapper { - margin: 0 0 0 274px; -} - -div.sphinxsidebar ul { - margin-right: 10px; -} - -div.sphinxsidebar li a { - word-break: break-all; -} - -span.pys-tag { - font-size: 11px; - font-weight: bold; - margin: 0 0 0 2px; - padding: 1px 3px 1px 3px; - -moz-border-radius: 3px; - -webkit-border-radius: 3px; - border-radius: 3px; - text-align: center; - text-decoration: none; -} - -span.pys-tag-experimental { - background-color: rgb(37, 112, 128); - color: rgb(255, 255, 255); -} - -span.pys-tag-deprecated { - background-color: rgb(238, 238, 238); - color: rgb(62, 67, 73); -} - -div.pys-note-experimental { - background-color: rgb(88, 151, 165); - border-color: rgb(59, 115, 127); - color: rgb(255, 255, 255); -} - -div.pys-note-deprecated { -} - -.hasTooltip { - position:relative; -} -.hasTooltip span { - display:none; -} - -.hasTooltip:hover span.tooltip { - display: inline-block; - -moz-border-radius: 2px; - -webkit-border-radius: 2px; - border-radius: 2px; - background-color: rgb(250, 250, 250); - color: rgb(68, 68, 68); - font-weight: normal; - box-shadow: 1px 1px 3px rgb(127, 127, 127); - position: absolute; - padding: 0 3px 0 3px; - top: 1.3em; - left: 14px; - z-index: 9999 -} diff --git a/python/docs/_static/pyspark.js b/python/docs/_static/pyspark.js deleted file mode 100644 index 75e4c42492a48..0000000000000 --- a/python/docs/_static/pyspark.js +++ /dev/null @@ -1,99 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -$(function (){ - - function startsWith(s, prefix) { - return s && s.indexOf(prefix) === 0; - } - - function buildSidebarLinkMap() { - var linkMap = {}; - $('div.sphinxsidebar a.reference.internal').each(function (i,a) { - var href = $(a).attr('href'); - if (startsWith(href, '#module-')) { - var id = href.substr(8); - linkMap[id] = [$(a), null]; - } - }) - return linkMap; - }; - - function getAdNoteDivs(dd) { - var noteDivs = {}; - dd.find('> div.admonition.note > p.last').each(function (i, p) { - var text = $(p).text(); - if (!noteDivs.experimental && startsWith(text, 'Experimental')) { - noteDivs.experimental = $(p).parent(); - } - if (!noteDivs.deprecated && startsWith(text, 'Deprecated')) { - noteDivs.deprecated = $(p).parent(); - } - }); - return noteDivs; - } - - function getParentId(name) { - var last_idx = name.lastIndexOf('.'); - return last_idx == -1? '': name.substr(0, last_idx); - } - - function buildTag(text, cls, tooltip) { - return '' + text + '' - + tooltip + '' - } - - - var sidebarLinkMap = buildSidebarLinkMap(); - - $('dl.class, dl.function').each(function (i,dl) { - - dl = $(dl); - dt = dl.children('dt').eq(0); - dd = dl.children('dd').eq(0); - var id = dt.attr('id'); - var desc = dt.find('> .descname').text(); - var adNoteDivs = getAdNoteDivs(dd); - - if (id) { - var parent_id = getParentId(id); - - var r = sidebarLinkMap[parent_id]; - if (r) { - if (r[1] === null) { - r[1] = $('
    '); - r[0].parent().append(r[1]); - } - var tags = ''; - if (adNoteDivs.experimental) { - tags += buildTag('E', 'pys-tag-experimental', 'Experimental'); - adNoteDivs.experimental.addClass('pys-note pys-note-experimental'); - } - if (adNoteDivs.deprecated) { - tags += buildTag('D', 'pys-tag-deprecated', 'Deprecated'); - adNoteDivs.deprecated.addClass('pys-note pys-note-deprecated'); - } - var li = $('
  • '); - var a = $('' + desc + ''); - li.append(a); - li.append(tags); - r[1].append(li); - sidebarLinkMap[id] = [a, null]; - } - } - }); -}); diff --git a/python/docs/_templates/layout.html b/python/docs/_templates/layout.html deleted file mode 100644 index ab36ebababf88..0000000000000 --- a/python/docs/_templates/layout.html +++ /dev/null @@ -1,6 +0,0 @@ -{% extends "!layout.html" %} -{% set script_files = script_files + ["_static/pyspark.js"] %} -{% set css_files = css_files + ['_static/pyspark.css'] %} -{% block rootrellink %} - {{ super() }} -{% endblock %} diff --git a/python/docs/conf.py b/python/docs/conf.py index 365d6af514177..163987dd8e5fa 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = '1.2' +#needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -135,7 +135,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +#html_static_path = ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b02d41b52ab25..6da6931c602e3 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2008,7 +2008,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): Implements the transforms required for fitting a dataset against an R model formula. Currently we support a limited subset of the R - operators, including '~', '.', ':', '+', and '-'. Also see the R formula + operators, including '~', '+', '-', and '.'. Also see the R formula docs: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 765a4511b64bc..978978d7fe4c6 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -301,10 +301,7 @@ def take(self, num): >>> df.take(2) [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] """ - with SCCallSiteSync(self._sc) as css: - port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe( - self._jdf, num) - return list(_load_from_socket(port, BatchedSerializer(PickleSerializer()))) + return self.limit(num).collect() @ignore_unicode_prefix @since(1.3) @@ -932,8 +929,6 @@ def dropDuplicates(self, subset=None): """Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns. - :func:`drop_duplicates` is an alias for :func:`dropDuplicates`. - >>> from pyspark.sql import Row >>> df = sc.parallelize([ \ Row(name='Alice', age=5, height=80), \ diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py index 03ea0b6d33c9d..0fee3b2096826 100644 --- a/python/pyspark/statcounter.py +++ b/python/pyspark/statcounter.py @@ -131,28 +131,6 @@ def stdev(self): def sampleStdev(self): return sqrt(self.sampleVariance()) - def asDict(self, sample=False): - """Returns the :class:`StatCounter` members as a ``dict``. 
- - >>> sc.parallelize([1., 2., 3., 4.]).stats().asDict() - {'count': 4L, - 'max': 4.0, - 'mean': 2.5, - 'min': 1.0, - 'stdev': 1.2909944487358056, - 'sum': 10.0, - 'variance': 1.6666666666666667} - """ - return { - 'count': self.count(), - 'mean': self.mean(), - 'sum': self.sum(), - 'min': self.min(), - 'max': self.max(), - 'stdev': self.stdev() if sample else self.sampleStdev(), - 'variance': self.variance() if sample else self.sampleVariance() - } - def __repr__(self): return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % (self.count(), self.mean(), self.stdev(), self.max(), self.min())) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 5bd94476597ab..d8c1560643e67 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1986,26 +1986,6 @@ def test_statcounter_array(self): self.assertSequenceEqual([3.0, 3.0], s.max().tolist()) self.assertSequenceEqual([1.0, 1.0], s.sampleStdev().tolist()) - stats_dict = s.asDict() - self.assertEqual(3, stats_dict['count']) - self.assertSequenceEqual([2.0, 2.0], stats_dict['mean'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['min'].tolist()) - self.assertSequenceEqual([3.0, 3.0], stats_dict['max'].tolist()) - self.assertSequenceEqual([6.0, 6.0], stats_dict['sum'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['stdev'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['variance'].tolist()) - - stats_sample_dict = s.asDict(sample=True) - self.assertEqual(3, stats_dict['count']) - self.assertSequenceEqual([2.0, 2.0], stats_sample_dict['mean'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_sample_dict['min'].tolist()) - self.assertSequenceEqual([3.0, 3.0], stats_sample_dict['max'].tolist()) - self.assertSequenceEqual([6.0, 6.0], stats_sample_dict['sum'].tolist()) - self.assertSequenceEqual( - [0.816496580927726, 0.816496580927726], stats_sample_dict['stdev'].tolist()) - self.assertSequenceEqual( - [0.6666666666666666, 0.6666666666666666], stats_sample_dict['variance'].tolist()) - if __name__ == "__main__": if not _have_scipy: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index d4334d16289a5..57af41ca2a3e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -208,7 +208,6 @@ object FunctionRegistry { expression[FormatNumber]("format_number"), expression[GetJsonObject]("get_json_object"), expression[InitCap]("initcap"), - expression[JsonTuple]("json_tuple"), expression[Lower]("lcase"), expression[Lower]("lower"), expression[Length]("length"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala index 394be47a588b7..35b74024a4cab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index a2fab258fcac3..1eefc9f9430b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate +import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.catalyst.InternalRow diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 8c9853e628d2c..1593d3c24978e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -21,9 +21,8 @@ import java.io.{StringWriter, ByteArrayOutputStream} import com.fasterxml.jackson.core._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.types.{StructField, StructType, StringType, DataType} +import org.apache.spark.sql.types.{StringType, DataType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -94,8 +93,8 @@ private[this] object JsonPathParser extends RegexParsers { } } -private[this] object SharedFactory { - val jsonFactory = new JsonFactory() +private[this] object GetJsonObject { + private val jsonFactory = new JsonFactory() // Enabled for Hive compatibility jsonFactory.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS) @@ -108,7 +107,7 @@ private[this] object SharedFactory { case class GetJsonObject(json: Expression, path: Expression) extends BinaryExpression with ExpectsInputTypes with CodegenFallback { - import SharedFactory._ + import GetJsonObject._ import PathInstruction._ import WriteStyle._ import com.fasterxml.jackson.core.JsonToken._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 28f616fbb9ca5..d46dff79147f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -52,7 +52,7 @@ abstract class LeafMathExpression(c: Double, name: String) * @param f The math function. 
* @param name The short name of the function */ -abstract class UnaryMathExpression(val f: Double => Double, name: String) +abstract class UnaryMathExpression(f: Double => Double, name: String) extends UnaryExpression with Serializable with ImplicitCastInputTypes { override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 9ab5c299d0f55..13a7c83696e62 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -245,14 +245,6 @@ case class AttributeReference( } } - def withExprId(newExprId: ExprId): AttributeReference = { - if (exprId == newExprId) { - this - } else { - AttributeReference(name, dataType, nullable, metadata)(newExprId, qualifiers) - } - } - override def toString: String = s"$name#${exprId.id}$typeSuffix" // Since the expression id is not in the first constructor it is missing from the default diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 8770c4b76c2e5..e1b0306271b1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -18,13 +18,15 @@ package org.apache.spark.sql.catalyst.expressions import java.text.DecimalFormat -import java.util.{HashMap, Locale, Map => JMap} +import java.util.Arrays +import java.util.{Map => JMap, HashMap} +import java.util.Locale import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +import org.apache.spark.unsafe.types.UTF8String //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines expressions for string operations. @@ -689,6 +691,34 @@ case class StringSpace(child: Expression) override def prettyName: String = "space" } +object Substring { + def subStringBinarySQL(bytes: Array[Byte], pos: Int, len: Int): Array[Byte] = { + if (pos > bytes.length) { + return Array[Byte]() + } + + var start = if (pos > 0) { + pos - 1 + } else if (pos < 0) { + bytes.length + pos + } else { + 0 + } + + val end = if ((bytes.length - start) < len) { + bytes.length + } else { + start + len + } + + start = Math.max(start, 0) // underflow + if (start < end) { + Arrays.copyOfRange(bytes, start, end) + } else { + Array[Byte]() + } + } +} /** * A function that takes a substring of its first argument starting at a given position. * Defined for String and Binary types. 
@@ -711,17 +741,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression) str.dataType match { case StringType => string.asInstanceOf[UTF8String] .substringSQL(pos.asInstanceOf[Int], len.asInstanceOf[Int]) - case BinaryType => ByteArray.subStringSQL(string.asInstanceOf[Array[Byte]], + case BinaryType => Substring.subStringBinarySQL(string.asInstanceOf[Array[Byte]], pos.asInstanceOf[Int], len.asInstanceOf[Int]) } } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val cls = classOf[Substring].getName defineCodeGen(ctx, ev, (string, pos, len) => { str.dataType match { case StringType => s"$string.substringSQL($pos, $len)" - case BinaryType => s"${classOf[ByteArray].getName}.subStringSQL($string, $pos, $len)" + case BinaryType => s"$cls.subStringBinarySQL($string, $pos, $len)" } }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index c7a1a2e7469ee..6ec9a107fb8b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -147,13 +147,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { } } - def toJavaBigDecimal: java.math.BigDecimal = { - if (decimalVal.ne(null)) { - decimalVal.underlying() - } else { - java.math.BigDecimal.valueOf(longVal, _scale) - } - } + def toJavaBigDecimal: java.math.BigDecimal = toBigDecimal.underlying() def toUnscaledLong: Long = { if (decimalVal.ne(null)) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f33125f463e14..4addbaf0cbce7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.unsafe.types.UTF8String class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val json = @@ -201,116 +199,4 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { GetJsonObject(NonFoldableLiteral(json), NonFoldableLiteral("$.fb:testid")), "1234") } - - val jsonTupleQuery = Literal("f1") :: - Literal("f2") :: - Literal("f3") :: - Literal("f4") :: - Literal("f5") :: - Nil - - test("json_tuple - hive key 1") { - checkEvaluation( - JsonTuple( - Literal("""{"f1": "value1", "f2": "value2", "f3": 3, "f5": 5.23}""") :: - jsonTupleQuery), - InternalRow.fromSeq(Seq("value1", "value2", "3", null, "5.23").map(UTF8String.fromString))) - } - - test("json_tuple - hive key 2") { - checkEvaluation( - JsonTuple( - Literal("""{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") :: - jsonTupleQuery), - InternalRow.fromSeq(Seq("value12", "2", "value3", "4.01", null).map(UTF8String.fromString))) - } - - test("json_tuple - hive key 2 (mix of foldable fields)") { - checkEvaluation( - JsonTuple(Literal("""{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") :: - Literal("f1") :: - NonFoldableLiteral("f2") :: - NonFoldableLiteral("f3") :: - Literal("f4") :: - Literal("f5") :: - Nil), - InternalRow.fromSeq(Seq("value12", "2", "value3", "4.01", null).map(UTF8String.fromString))) - 
} - - test("json_tuple - hive key 3") { - checkEvaluation( - JsonTuple( - Literal("""{"f1": "value13", "f4": "value44", "f3": "value33", "f2": 2, "f5": 5.01}""") :: - jsonTupleQuery), - InternalRow.fromSeq( - Seq("value13", "2", "value33", "value44", "5.01").map(UTF8String.fromString))) - } - - test("json_tuple - hive key 3 (nonfoldable json)") { - checkEvaluation( - JsonTuple( - NonFoldableLiteral( - """{"f1": "value13", "f4": "value44", - | "f3": "value33", "f2": 2, "f5": 5.01}""".stripMargin) - :: jsonTupleQuery), - InternalRow.fromSeq( - Seq("value13", "2", "value33", "value44", "5.01").map(UTF8String.fromString))) - } - - test("json_tuple - hive key 3 (nonfoldable fields)") { - checkEvaluation( - JsonTuple(Literal( - """{"f1": "value13", "f4": "value44", - | "f3": "value33", "f2": 2, "f5": 5.01}""".stripMargin) :: - NonFoldableLiteral("f1") :: - NonFoldableLiteral("f2") :: - NonFoldableLiteral("f3") :: - NonFoldableLiteral("f4") :: - NonFoldableLiteral("f5") :: - Nil), - InternalRow.fromSeq( - Seq("value13", "2", "value33", "value44", "5.01").map(UTF8String.fromString))) - } - - test("json_tuple - hive key 4 - null json") { - checkEvaluation( - JsonTuple(Literal(null) :: jsonTupleQuery), - InternalRow.fromSeq(Seq(null, null, null, null, null))) - } - - test("json_tuple - hive key 5 - null and empty fields") { - checkEvaluation( - JsonTuple(Literal("""{"f1": "", "f5": null}""") :: jsonTupleQuery), - InternalRow.fromSeq(Seq(UTF8String.fromString(""), null, null, null, null))) - } - - test("json_tuple - hive key 6 - invalid json (array)") { - checkEvaluation( - JsonTuple(Literal("[invalid JSON string]") :: jsonTupleQuery), - InternalRow.fromSeq(Seq(null, null, null, null, null))) - } - - test("json_tuple - invalid json (object start only)") { - checkEvaluation( - JsonTuple(Literal("{") :: jsonTupleQuery), - InternalRow.fromSeq(Seq(null, null, null, null, null))) - } - - test("json_tuple - invalid json (no object end)") { - checkEvaluation( - JsonTuple(Literal("""{"foo": "bar"""") :: jsonTupleQuery), - InternalRow.fromSeq(Seq(null, null, null, null, null))) - } - - test("json_tuple - invalid json (invalid json)") { - checkEvaluation( - JsonTuple(Literal("\\") :: jsonTupleQuery), - InternalRow.fromSeq(Seq(null, null, null, null, null))) - } - - test("json_tuple - preserve newlines") { - checkEvaluation( - JsonTuple(Literal("{\"a\":\"b\nc\"}") :: Literal("a") :: Nil), - InternalRow.fromSeq(Seq(UTF8String.fromString("b\nc")))) - } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala index 88ed9fdd6465f..f6f9af104f498 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala @@ -244,7 +244,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("ceil") { - testUnary(Ceil, (d: Double) => math.ceil(d).toLong) + testUnary(Ceil, math.ceil) checkConsistencyBetweenInterpretedAndCodegen(Ceil, DoubleType) testUnary(Ceil, (d: Decimal) => d.ceil, (-20 to 20).map(x => Decimal(x * 0.1))) @@ -254,7 +254,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("floor") { - testUnary(Floor, (d: Double) => math.floor(d).toLong) + testUnary(Floor, math.floor) checkConsistencyBetweenInterpretedAndCodegen(Floor, DoubleType) testUnary(Floor, (d: 
Decimal) => d.floor, (-20 to 20).map(x => Decimal(x * 0.1))) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala deleted file mode 100644 index 0d329497758c6..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions.aggregate - -import java.util.Random - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, MutableRow, BoundReference} -import org.apache.spark.sql.types.{DataType, IntegerType} - -import scala.collection.mutable -import org.scalatest.Assertions._ - -class HyperLogLogPlusPlusSuite extends SparkFunSuite { - - /** Create a HLL++ instance and an input and output buffer. */ - def createEstimator(rsd: Double, dt: DataType = IntegerType): - (HyperLogLogPlusPlus, MutableRow, MutableRow) = { - val input = new SpecificMutableRow(Seq(dt)) - val hll = new HyperLogLogPlusPlus(new BoundReference(0, dt, true), rsd) - val buffer = createBuffer(hll) - (hll, input, buffer) - } - - def createBuffer(hll: HyperLogLogPlusPlus): MutableRow = { - val buffer = new SpecificMutableRow(hll.aggBufferAttributes.map(_.dataType)) - hll.initialize(buffer) - buffer - } - - /** Evaluate the estimate. It should be within 3*SD's of the given true rsd. */ - def evaluateEstimate(hll: HyperLogLogPlusPlus, buffer: MutableRow, cardinality: Int): Unit = { - val estimate = hll.eval(buffer).asInstanceOf[Long].toDouble - val error = math.abs((estimate / cardinality.toDouble) - 1.0d) - assert(error < hll.trueRsd * 3.0d, "Error should be within 3 std. errors.") - } - - test("add nulls") { - val (hll, input, buffer) = createEstimator(0.05) - input.setNullAt(0) - hll.update(buffer, input) - hll.update(buffer, input) - val estimate = hll.eval(buffer).asInstanceOf[Long] - assert(estimate == 0L, "Nothing meaningful added; estimate should be 0.") - } - - def testCardinalityEstimates( - rsds: Seq[Double], - ns: Seq[Int], - f: Int => Int, - c: Int => Int): Unit = { - rsds.flatMap(rsd => ns.map(n => (rsd, n))).foreach { - case (rsd, n) => - val (hll, input, buffer) = createEstimator(rsd) - var i = 0 - while (i < n) { - input.setInt(0, f(i)) - hll.update(buffer, input) - i += 1 - } - val estimate = hll.eval(buffer).asInstanceOf[Long].toDouble - val cardinality = c(n) - val error = math.abs((estimate / cardinality.toDouble) - 1.0d) - assert(error < hll.trueRsd * 3.0d, "Error should be within 3 std. 
errors.") - } - } - - test("deterministic cardinality estimation") { - val repeats = 10 - testCardinalityEstimates( - Seq(0.1, 0.05, 0.025, 0.01), - Seq(100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000).map(_ * repeats), - i => i / repeats, - i => i / repeats) - } - - test("random cardinality estimation") { - val srng = new Random(323981238L) - val seen = mutable.HashSet.empty[Int] - val update = (i: Int) => { - val value = srng.nextInt() - seen += value - value - } - val eval = (n: Int) => { - val cardinality = seen.size - seen.clear() - cardinality - } - testCardinalityEstimates( - Seq(0.05, 0.01), - Seq(100, 10000, 500000), - update, - eval) - } - - // Test merging - test("merging HLL instances") { - val (hll, input, buffer1a) = createEstimator(0.05) - val buffer1b = createBuffer(hll) - val buffer2 = createBuffer(hll) - - // Create the - // Add the lower half - var i = 0 - while (i < 500000) { - input.setInt(0, i) - hll.update(buffer1a, input) - i += 1 - } - - // Add the upper half - i = 500000 - while (i < 1000000) { - input.setInt(0, i) - hll.update(buffer1b, input) - i += 1 - } - - // Merge the lower and upper halfs. - hll.merge(buffer1a, buffer1b) - - // Create the other buffer in reverse - i = 999999 - while (i >= 0) { - input.setInt(0, i) - hll.update(buffer2, input) - i -= 1 - } - - // Check if the buffers are equal. - assert(buffer2 == buffer1a, "Buffers should be equal") - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index f2d4db5550273..69444bf23d88d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1680,7 +1680,7 @@ class DataFrame private[sql]( */ def toJSON: RDD[String] = { val rowSchema = this.schema - queryExecution.toRdd.mapPartitions { iter => + this.mapPartitions { iter => val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) @@ -1711,7 +1711,7 @@ class DataFrame private[sql]( */ def inputFiles: Array[String] = { val files: Seq[String] = logicalPlan.collect { - case LogicalRelation(fsBasedRelation: FileRelation, _) => + case LogicalRelation(fsBasedRelation: FileRelation) => fsBasedRelation.inputFiles case fr: FileRelation => fr.inputFiles diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index ed8b634ad5630..d94dd83b11292 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -332,7 +332,8 @@ private[spark] object SQLConf { "subclass of org.apache.hadoop.mapreduce.OutputCommitter. Typically, it's also a subclass " + "of org.apache.parquet.hadoop.ParquetOutputCommitter. NOTE: 1. Instead of SQLConf, this " + "option must be set in Hadoop Configuration. 2. This option overrides " + - "\"spark.sql.sources.outputCommitterClass\".") + "\"spark.sql.sources.outputCommitterClass\"." 
+ ) val ORC_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.orc.filterPushdown", defaultValue = Some(false), @@ -543,7 +544,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) - private[spark] def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT) + private[spark] def followParquetFormatSpec: Boolean = getConf(PARQUET_FOLLOW_PARQUET_FORMAT_SPEC) private[spark] def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 5ad3871093fc8..16a36fbc6d454 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.beans.{BeanInfo, Introspector} +import java.beans.Introspector import java.util.Properties import java.util.concurrent.atomic.AtomicReference @@ -557,12 +557,21 @@ class SQLContext private[sql]( * @since 1.3.0 */ def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { - val attributeSeq: Seq[AttributeReference] = getSchema(beanClass) + val attributeSeq = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => // BeanInfo is not serializable so we must rediscover it remotely for each partition. val localBeanInfo = Introspector.getBeanInfo(Utils.classForName(className)) - SQLContext.beansToRows(iter, localBeanInfo, attributeSeq) + val extractors = + localBeanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod) + val methodsToConverts = extractors.zip(attributeSeq).map { case (e, attr) => + (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType)) + } + iter.map { row => + new GenericInternalRow( + methodsToConverts.map { case (e, convert) => convert(e.invoke(row)) }.toArray[Any] + ): InternalRow + } } DataFrame(this, LogicalRDD(attributeSeq, rowRdd)(this)) } @@ -579,23 +588,6 @@ class SQLContext private[sql]( createDataFrame(rdd.rdd, beanClass) } - /** - * Applies a schema to an List of Java Beans. - * - * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, - * SELECT * queries will return the columns in an undefined order. - * @group dataframes - * @since 1.6.0 - */ - def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { - val attrSeq = getSchema(beanClass) - val className = beanClass.getName - val beanInfo = Introspector.getBeanInfo(beanClass) - val rows = SQLContext.beansToRows(data.asScala.iterator, beanInfo, attrSeq) - DataFrame(self, LocalRelation(attrSeq, rows.toSeq)) - } - - /** * :: Experimental :: * Returns a [[DataFrameReader]] that can be used to read data in as a [[DataFrame]]. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala index ba61003ba41c6..7618a0c216ad4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala @@ -213,8 +213,8 @@ private[sql] class StringColumnStats extends ColumnStats { super.gatherStats(row, ordinal) if (!row.isNullAt(ordinal)) { val value = row.getUTF8String(ordinal) - if (upper == null || value.compareTo(upper) > 0) upper = value.clone() - if (lower == null || value.compareTo(lower) < 0) lower = value.clone() + if (upper == null || value.compareTo(upper) > 0) upper = value + if (lower == null || value.compareTo(lower) < 0) lower = value sizeInBytes += STRING.actualSize(row, ordinal) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala index ba7f6287ac6c3..4401c3c5769cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala @@ -33,11 +33,13 @@ private[sql] case class LocalTableScan( protected override def doExecute(): RDD[InternalRow] = rdd - override def executeCollect(): Array[InternalRow] = { - rows.toArray + override def executeCollect(): Array[Row] = { + val converter = CatalystTypeConverters.createToScalaConverter(schema) + rows.map(converter(_).asInstanceOf[Row]).toArray } - override def executeTake(limit: Int): Array[InternalRow] = { - rows.take(limit).toArray + override def executeTake(limit: Int): Array[Row] = { + val converter = CatalystTypeConverters.createToScalaConverter(schema) + rows.map(converter(_).asInstanceOf[Row]).take(limit).toArray } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index 42891287a3006..53a45e34ded2e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -147,12 +147,6 @@ class ShuffledRowRDD( } } - override def getPreferredLocations(partition: Partition): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - val dep = dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] - tracker.getPreferredLocationsForShuffle(dep, partition.index) - } - override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { val shuffledRowPartition = split.asInstanceOf[ShuffledRowRDDPartition] // The range of pre-shuffle partitions that we are fetching at here is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 8bb293ae87e64..fb5f058907466 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -168,16 +168,11 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** * Runs this query returning the result as an array. */ - def executeCollect(): Array[InternalRow] = { - execute().map(_.copy()).collect() - } - - /** - * Runs this query returning the result as an array, using external Row format. 
- */ - def executeCollectPublic(): Array[Row] = { - val converter = CatalystTypeConverters.createToScalaConverter(schema) - executeCollect().map(converter(_).asInstanceOf[Row]) + def executeCollect(): Array[Row] = { + execute().mapPartitions { iter => + val converter = CatalystTypeConverters.createToScalaConverter(schema) + iter.map(converter(_).asInstanceOf[Row]) + }.collect() } /** @@ -185,9 +180,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ * * This is modeled after RDD.take but never runs any job locally on the driver. */ - def executeTake(n: Int): Array[InternalRow] = { + def executeTake(n: Int): Array[Row] = { if (n == 0) { - return new Array[InternalRow](0) + return new Array[Row](0) } val childRDD = execute().map(_.copy()) @@ -221,7 +216,8 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ partsScanned += numPartsToTry } - buf.toArray + val converter = CatalystTypeConverters.createToScalaConverter(schema) + buf.toArray.map(converter(_).asInstanceOf[Row]) } private[this] def isTesting: Boolean = sys.props.contains("spark.testing") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala index 7e981268de392..e060c06d9e2a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala @@ -45,9 +45,16 @@ private[sql] class UnsafeRowSerializer(numFields: Int) extends Serializer with S } private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInstance { + + /** + * Marks the end of a stream written with [[serializeStream()]]. + */ + private[this] val EOF: Int = -1 + /** * Serializes a stream of UnsafeRows. Within the stream, each record consists of a record * length (stored as a 4-byte integer, written high byte first), followed by the record's bytes. + * The end of the stream is denoted by a record with the special length `EOF` (-1). 
*/ override def serializeStream(out: OutputStream): SerializationStream = new SerializationStream { private[this] var writeBuffer: Array[Byte] = new Array[Byte](4096) @@ -85,6 +92,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst override def close(): Unit = { writeBuffer = null + dOut.writeInt(EOF) dOut.close() } } @@ -96,20 +104,12 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst private[this] var rowBuffer: Array[Byte] = new Array[Byte](1024) private[this] var row: UnsafeRow = new UnsafeRow() private[this] var rowTuple: (Int, UnsafeRow) = (0, row) - private[this] val EOF: Int = -1 override def asKeyValueIterator: Iterator[(Int, UnsafeRow)] = { new Iterator[(Int, UnsafeRow)] { + private[this] var rowSize: Int = dIn.readInt() + if (rowSize == EOF) dIn.close() - private[this] def readSize(): Int = try { - dIn.readInt() - } catch { - case e: EOFException => - dIn.close() - EOF - } - - private[this] var rowSize: Int = readSize() override def hasNext: Boolean = rowSize != EOF override def next(): (Int, UnsafeRow) = { @@ -118,7 +118,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst } ByteStreams.readFully(dIn, rowBuffer, 0, rowSize) row.pointTo(rowBuffer, Platform.BYTE_ARRAY_OFFSET, numFields, rowSize) - rowSize = readSize() + rowSize = dIn.readInt() // read the next row's size if (rowSize == EOF) { // We are returning the last row in this stream dIn.close() val _rowTuple = rowTuple diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala index 99fb7a40b72e1..11a2867a301ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import scala.collection.mutable.ArrayBuffer /** - * The base class of [[SortBasedAggregationIterator]]. + * The base class of [[SortBasedAggregationIterator]] and [[UnsafeHybridAggregationIterator]]. * It mainly contains two parts: * 1. It initializes aggregate functions. * 2. 
It creates two functions, `processRow` and `generateOutput` based on [[AggregateMode]] of diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 799650a4f784f..dc7d441faf70d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -182,7 +182,7 @@ case class Limit(limit: Int, child: SparkPlan) override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = SinglePartition - override def executeCollect(): Array[InternalRow] = child.executeTake(limit) + override def executeCollect(): Array[Row] = child.executeTake(limit) protected override def doExecute(): RDD[InternalRow] = { val rdd: RDD[_ <: Product2[Boolean, InternalRow]] = if (sortBasedShuffleOn) { @@ -234,8 +234,9 @@ case class TakeOrderedAndProject( projection.map(data.map(_)).getOrElse(data) } - override def executeCollect(): Array[InternalRow] = { - collectData() + override def executeCollect(): Array[Row] = { + val converter = CatalystTypeConverters.createToScalaConverter(schema) + collectData().map(converter(_).asInstanceOf[Row]) } // TODO: Terminal split should be implemented differently from non-terminal split. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index e5f60b15e7359..6c1db0f423947 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -53,21 +53,20 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan * The `execute()` method of all the physical command classes should reference `sideEffectResult` * so that the command can be executed eagerly right after the command query is created. 
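
The run-once behavior that the comment above relies on comes from Scala's `lazy val` memoization: whichever access path touches `sideEffectResult` first triggers the command, and every later path reuses the cached result. A toy, non-Spark illustration (all names hypothetical):

// Toy stand-in for a physical command node.
class CommandSketch(run: () => Seq[String]) {
  protected lazy val sideEffectResult: Seq[String] = run()   // evaluated at most once

  def executeCollect(): Array[String] = sideEffectResult.toArray
  def executeTake(limit: Int): Array[String] = sideEffectResult.take(limit).toArray
  def doExecute(): Iterator[String] = sideEffectResult.iterator  // stand-in for parallelize(converted, 1)
}

object CommandSketchDemo {
  def main(args: Array[String]): Unit = {
    var runs = 0
    val cmd = new CommandSketch(() => { runs += 1; Seq("OK") })
    cmd.executeCollect()
    cmd.executeTake(1)
    cmd.doExecute()
    assert(runs == 1)  // the command body ran exactly once across all three calls
  }
}
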
*/ - protected[sql] lazy val sideEffectResult: Seq[InternalRow] = { - val converter = CatalystTypeConverters.createToCatalystConverter(schema) - cmd.run(sqlContext).map(converter(_).asInstanceOf[InternalRow]) - } + protected[sql] lazy val sideEffectResult: Seq[Row] = cmd.run(sqlContext) override def output: Seq[Attribute] = cmd.output override def children: Seq[SparkPlan] = Nil - override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray + override def executeCollect(): Array[Row] = sideEffectResult.toArray - override def executeTake(limit: Int): Array[InternalRow] = sideEffectResult.take(limit).toArray + override def executeTake(limit: Int): Array[Row] = sideEffectResult.take(limit).toArray protected override def doExecute(): RDD[InternalRow] = { - sqlContext.sparkContext.parallelize(sideEffectResult, 1) + val convert = CatalystTypeConverters.createToCatalystConverter(schema) + val converted = sideEffectResult.map(convert(_).asInstanceOf[InternalRow]) + sqlContext.sparkContext.parallelize(converted, 1) } override def argString: String = cmd.toString diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 7265d6a4de2e6..a916d143d0256 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -38,7 +38,7 @@ import org.apache.spark.{Logging, TaskContext} */ private[sql] object DataSourceStrategy extends Strategy with Logging { def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match { - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: CatalystScan, _)) => + case PhysicalOperation(projects, filters, l @ LogicalRelation(t: CatalystScan)) => pruneFilterProjectRaw( l, projects, @@ -46,14 +46,14 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { (requestedColumns, allPredicates, _) => toCatalystRDD(l, requestedColumns, t.buildScan(requestedColumns, allPredicates))) :: Nil - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedFilteredScan, _)) => + case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedFilteredScan)) => pruneFilterProject( l, projects, filters, (a, f) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f))) :: Nil - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedScan, _)) => + case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedScan)) => pruneFilterProject( l, projects, @@ -61,7 +61,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { (a, _) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray))) :: Nil // Scanning partitioned HadoopFsRelation - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _)) + case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation)) if t.partitionSpec.partitionColumns.nonEmpty => // We divide the filter expressions into 3 parts val partitionColumns = AttributeSet( @@ -97,7 +97,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { .map(execution.Filter(_, scan)).getOrElse(scan) :: Nil // Scanning non-partitioned HadoopFsRelation - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _)) => + case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation)) => // 
See buildPartitionedTableScan for the reason that we need to create a shard // broadcast HadoopConf. val sharedHadoopConf = SparkHadoopUtil.get.conf @@ -109,16 +109,16 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { filters, (a, f) => t.buildInternalScan(a.map(_.name).toArray, f, t.paths, confBroadcast)) :: Nil - case l @ LogicalRelation(baseRelation: TableScan, _) => + case l @ LogicalRelation(baseRelation: TableScan) => execution.PhysicalRDD.createFromDataSource( l.output, toCatalystRDD(l, baseRelation.buildScan()), baseRelation) :: Nil - case i @ logical.InsertIntoTable(l @ LogicalRelation(t: InsertableRelation, _), - part, query, overwrite, false) if part.isEmpty => + case i @ logical.InsertIntoTable( + l @ LogicalRelation(t: InsertableRelation), part, query, overwrite, false) if part.isEmpty => execution.ExecutedCommand(InsertIntoDataSource(l, query, overwrite)) :: Nil case i @ logical.InsertIntoTable( - l @ LogicalRelation(t: HadoopFsRelation, _), part, query, overwrite, false) => + l @ LogicalRelation(t: HadoopFsRelation), part, query, overwrite, false) => val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append execution.ExecutedCommand(InsertIntoHadoopFsRelation(t, query, mode)) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 219dae88e515d..680ccc79c3ced 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -17,40 +17,23 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.sources.BaseRelation /** * Used to link a [[BaseRelation]] in to a logical query plan. - * - * Note that sometimes we need to use `LogicalRelation` to replace an existing leaf node without - * changing the output attributes' IDs. The `expectedOutputAttributes` parameter is used for - * this purpose. See https://issues.apache.org/jira/browse/SPARK-10741 for more details. */ -case class LogicalRelation( - relation: BaseRelation, - expectedOutputAttributes: Option[Seq[Attribute]] = None) - extends LeafNode with MultiInstanceRelation { +private[sql] case class LogicalRelation(relation: BaseRelation) + extends LeafNode + with MultiInstanceRelation { - override val output: Seq[AttributeReference] = { - val attrs = relation.schema.toAttributes - expectedOutputAttributes.map { expectedAttrs => - assert(expectedAttrs.length == attrs.length) - attrs.zip(expectedAttrs).map { - // We should respect the attribute names provided by base relation and only use the - // exprId in `expectedOutputAttributes`. - // The reason is that, some relations(like parquet) will reconcile attribute names to - // workaround case insensitivity issue. - case (attr, expected) => attr.withExprId(expected.exprId) - } - }.getOrElse(attrs) - } + override val output: Seq[AttributeReference] = relation.schema.toAttributes // Logical Relations are distinct if they have different output for the sake of transformations. 
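
The reason so many otherwise unrelated matches in this patch change from `LogicalRelation(t, _)` back to `LogicalRelation(t)` is that removing a case-class field changes the arity of its generated extractor, so every `case` that destructures it must be updated in the same change. A toy, non-Spark illustration (all names hypothetical):

object ExtractorAritySketch {
  // With two constructor parameters every pattern must bind or wildcard the second one.
  case class RelationWithExpectedOutput(relation: String, expectedOutput: Option[Seq[String]] = None)
  // With a single parameter the same match sites shrink to a one-argument pattern.
  case class RelationOnly(relation: String)

  def name(plan: Any): String = plan match {
    case RelationWithExpectedOutput(rel, _) => rel  // analogue of LogicalRelation(t, _)
    case RelationOnly(rel)                  => rel  // analogue of LogicalRelation(t)
    case _                                  => "unknown"
  }

  def main(args: Array[String]): Unit = {
    assert(name(RelationWithExpectedOutput("parquet")) == "parquet")
    assert(name(RelationOnly("json")) == "json")
  }
}
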
override def equals(other: Any): Boolean = other match { - case l @ LogicalRelation(otherRelation, _) => relation == otherRelation && output == l.output - case _ => false + case l @ LogicalRelation(otherRelation) => relation == otherRelation && output == l.output + case _ => false } override def hashCode: Int = { @@ -58,7 +41,7 @@ case class LogicalRelation( } override def sameResult(otherPlan: LogicalPlan): Boolean = otherPlan match { - case LogicalRelation(otherRelation, _) => relation == otherRelation + case LogicalRelation(otherRelation) => relation == otherRelation case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 85b52f04c8d01..6f17bdfb14f84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -199,7 +199,7 @@ private[json] class JsonOutputWriter( override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") override protected[sql] def writeInternal(row: InternalRow): Unit = { - JacksonGenerator(dataSchema, gen)(row) + JacksonGenerator(dataSchema, gen, row) gen.flush() result.set(writer.toString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 3f34520afe6b6..9c06b4c320527 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -28,6 +28,88 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.types._ private[sql] object JacksonGenerator { + /** Transforms a single Row to JSON using Jackson + * + * @param rowSchema the schema object used for conversion + * @param gen a JsonGenerator object + * @param row The row to convert + */ + def apply(rowSchema: StructType, gen: JsonGenerator)(row: Row): Unit = { + def valWriter: (DataType, Any) => Unit = { + case (_, null) | (NullType, _) => gen.writeNull() + case (StringType, v: String) => gen.writeString(v) + case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString) + case (IntegerType, v: Int) => gen.writeNumber(v) + case (ShortType, v: Short) => gen.writeNumber(v) + case (FloatType, v: Float) => gen.writeNumber(v) + case (DoubleType, v: Double) => gen.writeNumber(v) + case (LongType, v: Long) => gen.writeNumber(v) + case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v) + case (ByteType, v: Byte) => gen.writeNumber(v.toInt) + case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) + case (BooleanType, v: Boolean) => gen.writeBoolean(v) + case (DateType, v) => gen.writeString(v.toString) + case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, udt.serialize(v)) + + case (ArrayType(ty, _), v: Seq[_]) => + gen.writeStartArray() + v.foreach(valWriter(ty, _)) + gen.writeEndArray() + + case (MapType(kv, vv, _), v: Map[_, _]) => + gen.writeStartObject() + v.foreach { p => + gen.writeFieldName(p._1.toString) + valWriter(vv, p._2) + } + gen.writeEndObject() + + case (StructType(ty), v: Row) => + gen.writeStartObject() + ty.zip(v.toSeq).foreach { + case (_, null) => + case (field, v) => + gen.writeFieldName(field.name) + 
valWriter(field.dataType, v) + } + gen.writeEndObject() + + // For UDT, udt.serialize will produce SQL types. So, we need the following three cases. + case (ArrayType(ty, _), v: ArrayData) => + gen.writeStartArray() + v.foreach(ty, (_, value) => valWriter(ty, value)) + gen.writeEndArray() + + case (MapType(kt, vt, _), v: MapData) => + gen.writeStartObject() + v.foreach(kt, vt, { (k, v) => + gen.writeFieldName(k.toString) + valWriter(vt, v) + }) + gen.writeEndObject() + + case (StructType(ty), v: InternalRow) => + gen.writeStartObject() + var i = 0 + while (i < ty.length) { + val field = ty(i) + val value = v.get(i, field.dataType) + if (value != null) { + gen.writeFieldName(field.name) + valWriter(field.dataType, value) + } + i += 1 + } + gen.writeEndObject() + + case (dt, v) => + sys.error( + s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") + } + + valWriter(rowSchema, row) + } + /** Transforms a single InternalRow to JSON using Jackson * * TODO: make the code shared with the other apply method. @@ -36,7 +118,7 @@ private[sql] object JacksonGenerator { * @param gen a JsonGenerator object * @param row The row to convert */ - def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = { + def apply(rowSchema: StructType, gen: JsonGenerator, row: InternalRow): Unit = { def valWriter: (DataType, Any) => Unit = { case (_, null) | (NullType, _) => gen.writeNull() case (StringType, v) => gen.writeString(v.toString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index a958373eb769d..9703234bd799c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} -import scala.collection.JavaConverters._ +import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, mapAsScalaMapConverter} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.ReadSupport.ReadContext @@ -29,62 +29,34 @@ import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema._ import org.apache.spark.Logging -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ -/** - * A Parquet [[ReadSupport]] implementation for reading Parquet records as Catalyst - * [[InternalRow]]s. - * - * The API interface of [[ReadSupport]] is a little bit over complicated because of historical - * reasons. In older versions of parquet-mr (say 1.6.0rc3 and prior), [[ReadSupport]] need to be - * instantiated and initialized twice on both driver side and executor side. The [[init()]] method - * is for driver side initialization, while [[prepareForRead()]] is for executor side. However, - * starting from parquet-mr 1.6.0, it's no longer the case, and [[ReadSupport]] is only instantiated - * and initialized on executor side. So, theoretically, now it's totally fine to combine these two - * methods into a single initialization method. The only reason (I could think of) to still have - * them here is for parquet-mr API backwards-compatibility. 
- * - * Due to this reason, we no longer rely on [[ReadContext]] to pass requested schema from [[init()]] - * to [[prepareForRead()]], but use a private `var` for simplicity. - */ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { - private var catalystRequestedSchema: StructType = _ - - /** - * Called on executor side before [[prepareForRead()]] and instantiating actual Parquet record - * readers. Responsible for figuring out Parquet requested schema used for column pruning. - */ - override def init(context: InitContext): ReadContext = { - catalystRequestedSchema = { - // scalastyle:off jobcontext - val conf = context.getConfiguration - // scalastyle:on jobcontext - val schemaString = conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) - assert(schemaString != null, "Parquet requested schema not set.") - StructType.fromString(schemaString) - } - - val parquetRequestedSchema = - CatalystReadSupport.clipParquetSchema(context.getFileSchema, catalystRequestedSchema) - - new ReadContext(parquetRequestedSchema, Map.empty[String, String].asJava) - } - - /** - * Called on executor side after [[init()]], before instantiating actual Parquet record readers. - * Responsible for instantiating [[RecordMaterializer]], which is used for converting Parquet - * records to Catalyst [[InternalRow]]s. - */ + // Called after `init()` when initializing Parquet record reader. override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[InternalRow] = { log.debug(s"Preparing for read Parquet file with message type: $fileSchema") + + val toCatalyst = new CatalystSchemaConverter(conf) val parquetRequestedSchema = readContext.getRequestedSchema + val catalystRequestedSchema = + Option(readContext.getReadSupportMetadata).map(_.asScala).flatMap { metadata => + metadata + // First tries to read requested schema, which may result from projections + .get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) + // If not available, tries to read Catalyst schema from file metadata. It's only + // available if the target file is written by Spark SQL. + .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY)) + }.map(StructType.fromString).getOrElse { + logInfo("Catalyst schema not available, falling back to Parquet schema") + toCatalyst.convert(parquetRequestedSchema) + } + logInfo { s"""Going to read the following fields from the Parquet file: | @@ -99,6 +71,36 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with parquetRequestedSchema, CatalystReadSupport.expandUDT(catalystRequestedSchema)) } + + // Called before `prepareForRead()` when initializing Parquet record reader. + override def init(context: InitContext): ReadContext = { + val conf = { + // scalastyle:off jobcontext + context.getConfiguration + // scalastyle:on jobcontext + } + + // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst + // schema of this file from its metadata. + val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA)) + + // Optional schema of requested columns, in the form of a string serialized from a Catalyst + // `StructType` containing all requested columns. 
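
A compact, non-Spark sketch of the precedence restored in `prepareForRead()` above: prefer the requested schema recorded in the read-support metadata, then the Catalyst schema Spark wrote into the file, and only then derive a schema from the Parquet file schema itself. The map keys and the `fromParquetSchema` helper below are placeholders, not Spark constants (the real code uses `CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA` and `CatalystReadSupport.SPARK_METADATA_KEY`).

object RequestedSchemaSketch {
  private val RequestedSchemaKey = "requestedSchema"  // placeholder key
  private val SparkSchemaKey = "sparkSchema"          // placeholder key

  def resolve(
      readSupportMetadata: Option[Map[String, String]],
      fromParquetSchema: () => String): String = {
    readSupportMetadata.flatMap { metadata =>
      metadata.get(RequestedSchemaKey)        // 1. projection pushed down by the query, if any
        .orElse(metadata.get(SparkSchemaKey)) // 2. Catalyst schema stored by Spark SQL writers
    }.getOrElse(fromParquetSchema())          // 3. fall back to converting the Parquet schema
  }

  def main(args: Array[String]): Unit = {
    val fallback = () => "schema-derived-from-parquet"
    assert(resolve(None, fallback) == "schema-derived-from-parquet")
    assert(resolve(Some(Map(SparkSchemaKey -> "file-schema")), fallback) == "file-schema")
    assert(resolve(Some(Map(RequestedSchemaKey -> "projected", SparkSchemaKey -> "file-schema")),
      fallback) == "projected")
  }
}
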
+ val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) + + val parquetRequestedSchema = + maybeRequestedSchema.fold(context.getFileSchema) { schemaString => + val catalystRequestedSchema = StructType.fromString(schemaString) + CatalystReadSupport.clipParquetSchema(context.getFileSchema, catalystRequestedSchema) + } + + val metadata = + Map.empty[String, String] ++ + maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ + maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) + + new ReadContext(parquetRequestedSchema, metadata.asJava) + } } private[parquet] object CatalystReadSupport { @@ -268,7 +270,7 @@ private[parquet] object CatalystReadSupport { private def clipParquetGroupFields( parquetRecord: GroupType, structType: StructType): Seq[Type] = { val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap - val toParquet = new CatalystSchemaConverter(writeLegacyParquetFormat = false) + val toParquet = new CatalystSchemaConverter(followParquetFormatSpec = true) structType.map { f => parquetFieldMap .get(f.name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index 1f653cd3d3cb1..d848b1e7605fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -323,13 +323,7 @@ private[parquet] class CatalystRowConverter( } override def addBinary(value: Binary): Unit = { - // The underlying `ByteBuffer` implementation is guaranteed to be `HeapByteBuffer`, so here we - // are using `Binary.toByteBuffer.array()` to steal the underlying byte array without copying - // it. - val buffer = value.toByteBuffer - val offset = buffer.position() - val numBytes = buffer.limit() - buffer.position() - updater.set(UTF8String.fromBytes(buffer.array(), offset, numBytes)) + updater.set(UTF8String.fromBytes(value.getBytes)) } } @@ -374,7 +368,7 @@ private[parquet] class CatalystRowConverter( Decimal(unscaled, precision, scale) } else { // Otherwise, resorts to an unscaled `BigInteger` instead. - Decimal(new BigDecimal(new BigInteger(value.getBytes), scale), precision, scale) + Decimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala index 7f3394c20ed3d..9814960ef3626 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala @@ -41,31 +41,34 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * @constructor * @param assumeBinaryIsString Whether unannotated BINARY fields should be assumed to be Spark SQL * [[StringType]] fields when converting Parquet a [[MessageType]] to Spark SQL - * [[StructType]]. This argument only affects Parquet read path. + * [[StructType]]. * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should be assumed to be Spark SQL * [[TimestampType]] fields when converting Parquet a [[MessageType]] to Spark SQL * [[StructType]]. 
Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which * has optional nanosecond precision, but different from `TIME_MILLS` and `TIMESTAMP_MILLIS` - * described in Parquet format spec. This argument only affects Parquet read path. - * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 - * and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. - * When set to false, use standard format defined in parquet-format spec. This argument only - * affects Parquet write path. + * described in Parquet format spec. + * @param followParquetFormatSpec Whether to generate standard DECIMAL, LIST, and MAP structure when + * converting Spark SQL [[StructType]] to Parquet [[MessageType]]. For Spark 1.4.x and + * prior versions, Spark SQL only supports decimals with a max precision of 18 digits, and + * uses non-standard LIST and MAP structure. Note that the current Parquet format spec is + * backwards-compatible with these settings. If this argument is set to `false`, we fallback + * to old style non-standard behaviors. */ private[parquet] class CatalystSchemaConverter( assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, - writeLegacyParquetFormat: Boolean = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get) { + followParquetFormatSpec: Boolean = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get +) { def this(conf: SQLConf) = this( assumeBinaryIsString = conf.isParquetBinaryAsString, assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp, - writeLegacyParquetFormat = conf.writeLegacyParquetFormat) + followParquetFormatSpec = conf.followParquetFormatSpec) def this(conf: Configuration) = this( assumeBinaryIsString = conf.get(SQLConf.PARQUET_BINARY_AS_STRING.key).toBoolean, assumeInt96IsTimestamp = conf.get(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key).toBoolean, - writeLegacyParquetFormat = conf.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean) + followParquetFormatSpec = conf.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key).toBoolean) /** * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]]. @@ -371,15 +374,15 @@ private[parquet] class CatalystSchemaConverter( case BinaryType => Types.primitive(BINARY, repetition).named(field.name) - // ====================== - // Decimals (legacy mode) - // ====================== + // ===================================== + // Decimals (for Spark version <= 1.4.x) + // ===================================== // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and // always store decimals in fixed-length byte arrays. To keep compatibility with these older // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated // by `DECIMAL`. 
- case DecimalType.Fixed(precision, scale) if writeLegacyParquetFormat => + case DecimalType.Fixed(precision, scale) if !followParquetFormatSpec => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -388,13 +391,13 @@ private[parquet] class CatalystSchemaConverter( .length(CatalystSchemaConverter.minBytesForPrecision(precision)) .named(field.name) - // ======================== - // Decimals (standard mode) - // ======================== + // ===================================== + // Decimals (follow Parquet format spec) + // ===================================== // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT32 && !writeLegacyParquetFormat => + if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -404,7 +407,7 @@ private[parquet] class CatalystSchemaConverter( // Uses INT64 for 1 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT64 && !writeLegacyParquetFormat => + if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -413,7 +416,7 @@ private[parquet] class CatalystSchemaConverter( .named(field.name) // Uses FIXED_LEN_BYTE_ARRAY for all other precisions - case DecimalType.Fixed(precision, scale) if !writeLegacyParquetFormat => + case DecimalType.Fixed(precision, scale) if followParquetFormatSpec => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -422,15 +425,15 @@ private[parquet] class CatalystSchemaConverter( .length(CatalystSchemaConverter.minBytesForPrecision(precision)) .named(field.name) - // =================================== - // ArrayType and MapType (legacy mode) - // =================================== + // =================================================== + // ArrayType and MapType (for Spark versions <= 1.4.x) + // =================================================== // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element // field name "array" is borrowed from parquet-avro. - case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => + case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => // group (LIST) { // optional group bag { // repeated array; @@ -448,7 +451,7 @@ private[parquet] class CatalystSchemaConverter( // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is // covered by the backwards-compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => + case ArrayType(elementType, nullable @ false) if !followParquetFormatSpec => // group (LIST) { // repeated element; // } @@ -460,7 +463,7 @@ private[parquet] class CatalystSchemaConverter( // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
- case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => + case MapType(keyType, valueType, valueContainsNull) if !followParquetFormatSpec => // group (MAP) { // repeated group map (MAP_KEY_VALUE) { // required key; @@ -473,11 +476,11 @@ private[parquet] class CatalystSchemaConverter( convertField(StructField("key", keyType, nullable = false)), convertField(StructField("value", valueType, valueContainsNull))) - // ===================================== - // ArrayType and MapType (standard mode) - // ===================================== + // ================================================== + // ArrayType and MapType (follow Parquet format spec) + // ================================================== - case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => + case ArrayType(elementType, containsNull) if followParquetFormatSpec => // group (LIST) { // repeated group list { // element; @@ -560,9 +563,9 @@ private[parquet] object CatalystSchemaConverter { // Returns the minimum number of bytes needed to store a decimal with a given `precision`. val minBytesForPrecision = Array.tabulate[Int](39)(computeMinBytesForPrecision) - val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) /* 9 */ + val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) - val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8) /* 18 */ + val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8) // Max precision of a decimal value stored in `numBytes` bytes def maxPrecisionForBytes(numBytes: Int): Int = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 5a7c6b95b565f..3024f6040e02e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -590,7 +590,7 @@ private[sql] object ParquetRelation extends Logging { val converter = new CatalystSchemaConverter( sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetBinaryAsString, - sqlContext.conf.writeLegacyParquetFormat) + sqlContext.conf.followParquetFormatSpec) converter.convert(schema) } @@ -724,7 +724,7 @@ private[sql] object ParquetRelation extends Logging { filesToTouch: Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = { val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val writeLegacyParquetFormat = sqlContext.conf.writeLegacyParquetFormat + val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration) // !! HACK ALERT !! 
@@ -764,7 +764,7 @@ private[sql] object ParquetRelation extends Logging { new CatalystSchemaConverter( assumeBinaryIsString = assumeBinaryIsString, assumeInt96IsTimestamp = assumeInt96IsTimestamp, - writeLegacyParquetFormat = writeLegacyParquetFormat) + followParquetFormatSpec = followParquetFormatSpec) footers.map { footer => ParquetRelation.readSchemaFromFooter(footer, converter) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 1a8e7ab202dc2..50f40e2f9e9f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -61,7 +61,7 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { // We are inserting into an InsertableRelation or HadoopFsRelation. case i @ InsertIntoTable( - l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _), _, child, _, _) => { + l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation), _, child, _, _) => { // First, make sure the data to be inserted have the same number of fields with the // schema of the relation. if (l.output.size != child.output.size) { @@ -108,14 +108,14 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => def apply(plan: LogicalPlan): Unit = { plan.foreach { case i @ logical.InsertIntoTable( - l @ LogicalRelation(t: InsertableRelation, _), partition, query, overwrite, ifNotExists) => + l @ LogicalRelation(t: InsertableRelation), partition, query, overwrite, ifNotExists) => // Right now, we do not support insert into a data source table with partition specs. if (partition.nonEmpty) { failAnalysis(s"Insert into a partition is not allowed because $l is not partitioned.") } else { // Get all input data source relations of the query. val srcRelations = query.collect { - case LogicalRelation(src: BaseRelation, _) => src + case LogicalRelation(src: BaseRelation) => src } if (srcRelations.contains(t)) { failAnalysis( @@ -126,7 +126,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => } case logical.InsertIntoTable( - LogicalRelation(r: HadoopFsRelation, _), part, query, overwrite, _) => + LogicalRelation(r: HadoopFsRelation), part, query, overwrite, _) => // We need to make sure the partition columns specified by users do match partition // columns of the relation. val existingPartitionColumns = r.partitionColumns.fieldNames.toSet @@ -145,7 +145,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => // Get all input data source relations of the query. val srcRelations = query.collect { - case LogicalRelation(src: BaseRelation, _) => src + case LogicalRelation(src: BaseRelation) => src } if (srcRelations.contains(r)) { failAnalysis( @@ -173,10 +173,10 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => EliminateSubQueries(catalog.lookupRelation(tableIdent)) match { // Only do the check if the table is a data source table // (the relation is a BaseRelation). - case l @ LogicalRelation(dest: BaseRelation, _) => + case l @ LogicalRelation(dest: BaseRelation) => // Get all input data source relations of the query. 
val srcRelations = query.collect { - case LogicalRelation(src: BaseRelation, _) => src + case LogicalRelation(src: BaseRelation) => src } if (srcRelations.contains(dest)) { failAnalysis( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala index d611b0011da16..86f694c949085 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala @@ -119,15 +119,6 @@ object EvaluatePython { def apply(udf: PythonUDF, child: LogicalPlan): EvaluatePython = new EvaluatePython(udf, child, AttributeReference("pythonUDF", udf.dataType)()) - def takeAndServe(df: DataFrame, n: Int): Int = { - registerPicklers() - val iter = new SerDeUtil.AutoBatchedPickler( - df.queryExecution.executedPlan.executeTake(n).iterator.map { row => - EvaluatePython.toJava(row, df.schema) - }) - PythonRDD.serveIterator(iter, s"serve-DataFrame") - } - /** * Helper for converting from Catalyst type to java type suitable for Pyrolite. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index 8d4854b698ed7..6c5157486c12e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -37,20 +37,22 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { override def pyUDT: String = "pyspark.sql.tests.ExamplePointUDT" - override def serialize(obj: Any): GenericArrayData = { + override def serialize(obj: Any): Seq[Double] = { obj match { case p: ExamplePoint => - val output = new Array[Any](2) - output(0) = p.x - output(1) = p.y - new GenericArrayData(output) + Seq(p.x, p.y) } } override def deserialize(datum: Any): ExamplePoint = { datum match { - case values: ArrayData => - new ExamplePoint(values.getDouble(0), values.getDouble(1)) + case values: Seq[_] => + val xy = values.asInstanceOf[Seq[Double]] + assert(xy.length == 2) + new ExamplePoint(xy(0), xy(1)) + case values: util.ArrayList[_] => + val xy = values.asInstanceOf[util.ArrayList[Double]].asScala + new ExamplePoint(xy(0), xy(1)) } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 40bff57a17a03..f56941c5853e6 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -141,7 +141,11 @@ public List getD() { } } - void validateDataFrameWithBeans(Bean bean, DataFrame df) { + @Test + public void testCreateDataFrameFromJavaBeans() { + Bean bean = new Bean(); + JavaRDD rdd = jsc.parallelize(Arrays.asList(bean)); + DataFrame df = context.createDataFrame(rdd, Bean.class); StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); @@ -177,22 +181,6 @@ void validateDataFrameWithBeans(Bean bean, DataFrame df) { } } - @Test - public void testCreateDataFrameFromLocalJavaBeans() { - Bean bean = new Bean(); - List data = Arrays.asList(bean); - DataFrame df = context.createDataFrame(data, 
Bean.class); - validateDataFrameWithBeans(bean, df); - } - - @Test - public void testCreateDataFrameFromJavaBeans() { - Bean bean = new Bean(); - JavaRDD rdd = jsc.parallelize(Arrays.asList(bean)); - DataFrame df = context.createDataFrame(rdd, Bean.class); - validateDataFrameWithBeans(bean, df); - } - @Test public void testCreateDataFromFromList() { StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true))); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index e3531d0d6d799..045fea82e4c89 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -29,42 +29,4 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { Row("alice", "5")) } - - val tuples: Seq[(String, String)] = - ("1", """{"f1": "value1", "f2": "value2", "f3": 3, "f5": 5.23}""") :: - ("2", """{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") :: - ("3", """{"f1": "value13", "f4": "value44", "f3": "value33", "f2": 2, "f5": 5.01}""") :: - ("4", null) :: - ("5", """{"f1": "", "f5": null}""") :: - ("6", "[invalid JSON string]") :: - Nil - - test("json_tuple select") { - val df: DataFrame = tuples.toDF("key", "jstring") - val expected = Row("1", Row("value1", "value2", "3", null, "5.23")) :: - Row("2", Row("value12", "2", "value3", "4.01", null)) :: - Row("3", Row("value13", "2", "value33", "value44", "5.01")) :: - Row("4", Row(null, null, null, null, null)) :: - Row("5", Row("", null, null, null, null)) :: - Row("6", Row(null, null, null, null, null)) :: - Nil - - checkAnswer(df.selectExpr("key", "json_tuple(jstring, 'f1', 'f2', 'f3', 'f4', 'f5')"), expected) - } - - test("json_tuple filter and group") { - val df: DataFrame = tuples.toDF("key", "jstring") - val expr = df - .selectExpr("json_tuple(jstring, 'f1', 'f2') as jt") - .where($"jt.c0".isNotNull) - .groupBy($"jt.c1") - .count() - - val expected = Row(null, 1) :: - Row("2", 2) :: - Row("value2", 1) :: - Nil - - checkAnswer(expr, expected) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala index 58f982c2bc932..30289c3c1d097 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala @@ -37,11 +37,9 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { private lazy val nullDoubles = Seq(NullDoubles(1.0), NullDoubles(2.0), NullDoubles(3.0), NullDoubles(null)).toDF() - private def testOneToOneMathFunction[ - @specialized(Int, Long, Float, Double) T, - @specialized(Int, Long, Float, Double) U]( + private def testOneToOneMathFunction[@specialized(Int, Long, Float, Double) T]( c: Column => Column, - f: T => U): Unit = { + f: T => T): Unit = { checkAnswer( doubleData.select(c('a)), (1 to 10).map(n => Row(f((n * 0.2 - 1).asInstanceOf[T]))) @@ -167,10 +165,10 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } test("ceil and ceiling") { - testOneToOneMathFunction(ceil, (d: Double) => math.ceil(d).toLong) + testOneToOneMathFunction(ceil, math.ceil) checkAnswer( sql("SELECT ceiling(0), ceiling(1), ceiling(1.5)"), - Row(0L, 1L, 2L)) + Row(0.0, 1.0, 2.0)) } test("conv") { @@ -186,7 +184,7 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } 
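
The expectation change in the ceil/ceiling test above hinges on the fact that `scala.math.ceil` (and `scala.math.floor` below) return `Double`: the reverted test goes back to expecting `Row(0.0, 1.0, 2.0)`, while the removed variant converted to `Long` explicitly. A quick REPL-style check of the difference, plain Scala with no Spark needed:

val revertedCeil: Double => Double = math.ceil             // Double in, Double out
val removedCeil: Double => Long = d => math.ceil(d).toLong // the variant being reverted away from

assert(revertedCeil(1.5) == 2.0)
assert(removedCeil(1.5) == 2L)
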
test("floor") { - testOneToOneMathFunction(floor, (d: Double) => math.floor(d).toLong) + testOneToOneMathFunction(floor, math.floor) } test("factorial") { @@ -230,7 +228,7 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } test("signum / sign") { - testOneToOneMathFunction[Double, Double](signum, math.signum) + testOneToOneMathFunction[Double](signum, math.signum) checkAnswer( sql("SELECT sign(10), signum(-11)"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 6265e40a0a07b..4efebe985c2bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -212,11 +212,4 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { // Drop the cache. cached.unpersist() } - - test("SPARK-10859: Predicates pushed to InMemoryColumnarTableScan are not evaluated correctly") { - val data = sqlContext.range(10).selectExpr("id", "cast(id as string) as s") - data.cache() - assert(data.count() === 10) - assert(data.filter($"s" === "3").count() === 1) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 8549a6a0f6643..3d218f01c9ead 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -245,7 +245,7 @@ object SparkPlanTest { } } ) - resolvedPlan.executeCollectPublic().toSeq + resolvedPlan.executeCollect().toSeq } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index 09e258299de5a..c0e9b3ffb2f67 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import java.io.{File, ByteArrayInputStream, ByteArrayOutputStream} +import java.io.{File, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.memory.TaskMemoryManager @@ -43,7 +43,7 @@ class ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStrea } } -class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { +class UnsafeRowSerializerSuite extends SparkFunSuite { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { val converter = unsafeRowConverter(schema) @@ -89,7 +89,11 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { } test("close empty input stream") { - val input = new ClosableByteArrayInputStream(Array.empty) + val baos = new ByteArrayOutputStream() + val dout = new DataOutputStream(baos) + dout.writeInt(-1) // EOF + dout.flush() + val input = new ClosableByteArrayInputStream(baos.toByteArray) val serializer = new UnsafeRowSerializer(numFields = 2).newInstance() val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator assert(!deserializerIter.hasNext) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index c24c9f025dad7..cf6640e3f1113 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -55,7 +55,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex .where(Column(predicate)) val analyzedPredicate = query.queryExecution.optimizedPlan.collect { - case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation, _)) => filters + case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation)) => filters }.flatten assert(analyzedPredicate.nonEmpty) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 61cc0da50865c..942607a2059ec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -513,7 +513,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = sqlContext.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: ParquetRelation, _) => + case LogicalRelation(relation: ParquetRelation) => assert(relation.partitionSpec === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 60fa81b1ab819..24f70b849fac5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.schema.MessageTypeParser +import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -34,29 +35,32 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { protected def testSchemaInference[T <: Product: ClassTag: TypeTag]( testName: String, messageType: String, - binaryAsString: Boolean, - int96AsTimestamp: Boolean, - writeLegacyParquetFormat: Boolean): Unit = { + binaryAsString: Boolean = true, + int96AsTimestamp: Boolean = true, + followParquetFormatSpec: Boolean = false, + isThriftDerived: Boolean = false): Unit = { testSchema( testName, StructType.fromAttributes(ScalaReflection.attributesFor[T]), messageType, binaryAsString, int96AsTimestamp, - writeLegacyParquetFormat) + followParquetFormatSpec, + isThriftDerived) } protected def testParquetToCatalyst( testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean, - int96AsTimestamp: Boolean, - writeLegacyParquetFormat: Boolean): Unit = { + binaryAsString: Boolean = true, + int96AsTimestamp: Boolean = true, + 
followParquetFormatSpec: Boolean = false, + isThriftDerived: Boolean = false): Unit = { val converter = new CatalystSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - writeLegacyParquetFormat = writeLegacyParquetFormat) + followParquetFormatSpec = followParquetFormatSpec) test(s"sql <= parquet: $testName") { val actual = converter.convert(MessageTypeParser.parseMessageType(parquetSchema)) @@ -74,13 +78,14 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean, - int96AsTimestamp: Boolean, - writeLegacyParquetFormat: Boolean): Unit = { + binaryAsString: Boolean = true, + int96AsTimestamp: Boolean = true, + followParquetFormatSpec: Boolean = false, + isThriftDerived: Boolean = false): Unit = { val converter = new CatalystSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - writeLegacyParquetFormat = writeLegacyParquetFormat) + followParquetFormatSpec = followParquetFormatSpec) test(s"sql => parquet: $testName") { val actual = converter.convert(sqlSchema) @@ -94,9 +99,10 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean, - int96AsTimestamp: Boolean, - writeLegacyParquetFormat: Boolean): Unit = { + binaryAsString: Boolean = true, + int96AsTimestamp: Boolean = true, + followParquetFormatSpec: Boolean = false, + isThriftDerived: Boolean = false): Unit = { testCatalystToParquet( testName, @@ -104,7 +110,8 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { parquetSchema, binaryAsString, int96AsTimestamp, - writeLegacyParquetFormat) + followParquetFormatSpec, + isThriftDerived) testParquetToCatalyst( testName, @@ -112,7 +119,8 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSQLContext { parquetSchema, binaryAsString, int96AsTimestamp, - writeLegacyParquetFormat) + followParquetFormatSpec, + isThriftDerived) } } @@ -129,9 +137,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _6; |} """.stripMargin, - binaryAsString = false, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + binaryAsString = false) testSchemaInference[(Byte, Short, Int, Long, java.sql.Date)]( "logical integral types", @@ -143,10 +149,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | required int64 _4 (INT_64); | optional int32 _5 (DATE); |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[String]]( "string", @@ -155,9 +158,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + binaryAsString = true) testSchemaInference[Tuple1[String]]( "binary enum as string", @@ -165,10 +166,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional binary _1 (ENUM); |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - non-standard", @@ -178,10 +176,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin, - binaryAsString = true, - 
int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - standard", @@ -194,9 +189,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - non-standard", @@ -208,10 +201,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - standard", @@ -224,9 +214,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( "map - standard", @@ -240,9 +228,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( "map - non-standard", @@ -255,10 +241,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[Pair[Int, String]]]( "struct", @@ -270,9 +253,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - non-standard", @@ -295,10 +276,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - standard", @@ -322,9 +300,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchemaInference[(Option[Int], Map[Int, Option[Double]])]( "optional types", @@ -339,9 +315,36 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) + + // Parquet files generated by parquet-thrift are already handled by the schema converter, but + // let's leave this test here until both read path and write path are all updated. 
+ ignore("thrift generated parquet schema") { + // Test for SPARK-4520 -- ensure that thrift generated parquet schema is generated + // as expected from attributes + testSchemaInference[( + Array[Byte], Array[Byte], Array[Byte], Seq[Int], Map[Array[Byte], Seq[Int]])]( + "thrift generated parquet schema", + """ + |message root { + | optional binary _1 (UTF8); + | optional binary _2 (UTF8); + | optional binary _3 (UTF8); + | optional group _4 (LIST) { + | repeated int32 _4_tuple; + | } + | optional group _5 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value (LIST) { + | repeated int32 value_tuple; + | } + | } + | } + |} + """.stripMargin, + isThriftDerived = true) + } } class ParquetSchemaSuite extends ParquetSchemaTest { @@ -467,10 +470,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with nullable element type - 2", @@ -486,10 +486,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -502,10 +499,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 2", @@ -518,10 +512,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 3", @@ -532,10 +523,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 element; | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 4", @@ -556,10 +544,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style", @@ -578,10 +563,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style", @@ -600,10 +582,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type 7 - " + @@ -613,10 +592,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | repeated int32 f1; |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( 
"Backwards-compatibility: LIST with non-nullable element type 8 - " + @@ -636,10 +612,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | required int32 c2; | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST @@ -660,9 +633,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testCatalystToParquet( "Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.4.x", @@ -678,10 +649,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -698,9 +666,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.4.x", @@ -714,10 +680,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) // ==================================================== // Tests for converting Parquet Map to Catalyst MapType @@ -738,10 +701,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 2", @@ -758,10 +718,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x", @@ -778,10 +735,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -798,10 +752,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 2", @@ -818,10 +769,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style", @@ -838,10 +786,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) // ==================================================== // Tests for converting Catalyst MapType 
to Parquet Map @@ -863,9 +808,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testCatalystToParquet( "Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.4.x", @@ -882,10 +825,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -903,9 +843,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.4.x", @@ -922,10 +860,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) // ================================= // Tests for conversion for decimals @@ -938,9 +873,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(1, 0)); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchema( "DECIMAL(8, 3) - standard", @@ -949,9 +882,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(8, 3)); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchema( "DECIMAL(9, 3) - standard", @@ -960,9 +891,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(9, 3)); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchema( "DECIMAL(18, 3) - standard", @@ -971,9 +900,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int64 f1 (DECIMAL(18, 3)); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchema( "DECIMAL(19, 3) - standard", @@ -982,9 +909,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(9) f1 (DECIMAL(19, 3)); |} """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = false) + followParquetFormatSpec = true) testSchema( "DECIMAL(1, 0) - prior to 1.4.x", @@ -992,10 +917,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(1) f1 (DECIMAL(1, 0)); |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchema( "DECIMAL(8, 3) - prior to 1.4.x", @@ -1003,10 +925,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(4) f1 (DECIMAL(8, 3)); |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchema( "DECIMAL(9, 3) - prior to 1.4.x", @@ -1014,10 +933,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); |} - """.stripMargin, - 
binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) testSchema( "DECIMAL(18, 3) - prior to 1.4.x", @@ -1025,10 +941,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3)); |} - """.stripMargin, - binaryAsString = true, - int96AsTimestamp = true, - writeLegacyParquetFormat = true) + """.stripMargin) private def testSchemaClipping( testName: String, diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index a4fd0c3ce9702..a0643cec0fb7c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -55,6 +55,7 @@ object HiveThriftServer2 extends Logging { @DeveloperApi def startWithContext(sqlContext: HiveContext): Unit = { val server = new HiveThriftServer2(sqlContext) + sqlContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) server.init(sqlContext.hiveconf) server.start() listener = new HiveThriftServer2Listener(server, sqlContext.conf) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index ff8ca0150649d..6dbcec1aa0f71 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -21,7 +21,6 @@ import java.io.File import java.net.URL import java.sql.{Date, DriverManager, SQLException, Statement} -import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration._ @@ -436,32 +435,6 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { } ) } - - test("Checks Hive version via SET -v") { - withJdbcStatement { statement => - val resultSet = statement.executeQuery("SET -v") - - val conf = mutable.Map.empty[String, String] - while (resultSet.next()) { - conf += resultSet.getString(1) -> resultSet.getString(2) - } - - assert(conf.get("spark.sql.hive.version") === Some("1.2.1")) - } - } - - test("Checks Hive version via SET") { - withJdbcStatement { statement => - val resultSet = statement.executeQuery("SET") - - val conf = mutable.Map.empty[String, String] - while (resultSet.next()) { - conf += resultSet.getString(1) -> resultSet.getString(2) - } - - assert(conf.get("spark.sql.hive.version") === Some("1.2.1")) - } - } } class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 2d72b959af134..e7963c08de079 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -607,7 +607,7 @@ class HiveContext private[hive]( /** Extends QueryExecution with hive specific features. 
*/ protected[sql] class QueryExecution(logicalPlan: LogicalPlan) - extends org.apache.spark.sql.execution.QueryExecution(this, logicalPlan) { + extends super.QueryExecution(logicalPlan) { /** * Returns the result as a hive compatible sequence of strings. For native commands, the @@ -625,10 +625,10 @@ class HiveContext private[hive]( .mkString("\t") } case command: ExecutedCommand => - command.executeCollect().map(_.getString(0)) + command.executeCollect().map(_(0).toString) case other => - val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq + val result: Seq[Seq[Any]] = other.executeCollect().map(_.toSeq).toSeq // We need the types so we can output struct field names val types = analyzed.output.map(_.dataType) // Reformat to match hive tab delimited output. @@ -662,11 +662,6 @@ private[hive] object HiveContext { doc = "Version of the Hive metastore. Available options are " + s"0.12.0 through $hiveExecutionVersion.") - val HIVE_EXECUTION_VERSION = stringConf( - key = "spark.sql.hive.version", - defaultValue = Some(hiveExecutionVersion), - doc = "Version of Hive used internally by Spark SQL.") - val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars", defaultValue = Some("builtin"), doc = s""" diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index f4d45714fae4e..287702e92998e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -422,7 +422,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = { cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => None // Cache miss - case logical @ LogicalRelation(parquetRelation: ParquetRelation, _) => + case logical @ LogicalRelation(parquetRelation: ParquetRelation) => // If we have the same paths, same schema, and same partition spec, // we will use the cached Parquet Relation. val useCached = @@ -488,7 +488,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive parquetRelation } - result.copy(expectedOutputAttributes = Some(metastoreRelation.output)) + result.newInstance() } override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { @@ -507,28 +507,60 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive return plan } - plan transformUp { + // Collects all `MetastoreRelation`s which should be replaced + val toBeReplaced = plan.collect { // Write path - case InsertIntoTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) - // Inserting into partitioned table is not supported in Parquet data source (yet). - if !r.hiveQlTable.isPartitioned && hive.convertMetastoreParquet && - r.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - val parquetRelation = convertToParquetRelation(r) - InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) + case InsertIntoTable(relation: MetastoreRelation, _, _, _, _) + // Inserting into partitioned table is not supported in Parquet data source (yet). 
+ if !relation.hiveQlTable.isPartitioned && + hive.convertMetastoreParquet && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) // Write path - case InsertIntoHiveTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) + case InsertIntoHiveTable(relation: MetastoreRelation, _, _, _, _) // Inserting into partitioned table is not supported in Parquet data source (yet). - if !r.hiveQlTable.isPartitioned && hive.convertMetastoreParquet && - r.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - val parquetRelation = convertToParquetRelation(r) - InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) + if !relation.hiveQlTable.isPartitioned && + hive.convertMetastoreParquet && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) // Read path case relation: MetastoreRelation if hive.convertMetastoreParquet && - relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => val parquetRelation = convertToParquetRelation(relation) - Subquery(relation.alias.getOrElse(relation.tableName), parquetRelation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) + } + + val relationMap = toBeReplaced.map(r => (r._1, r._2)).toMap + val attributedRewrites = AttributeMap(toBeReplaced.map(_._3).fold(Nil)(_ ++: _)) + + // Replaces all `MetastoreRelation`s with corresponding `ParquetRelation2`s, and fixes + // attribute IDs referenced in other nodes. + plan.transformUp { + case r: MetastoreRelation if relationMap.contains(r) => + val parquetRelation = relationMap(r) + val alias = r.alias.getOrElse(r.tableName) + Subquery(alias, parquetRelation) + + case InsertIntoTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) + if relationMap.contains(r) => + val parquetRelation = relationMap(r) + InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) + + case InsertIntoHiveTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) + if relationMap.contains(r) => + val parquetRelation = relationMap(r) + InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) + + case other => other.transformExpressions { + case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) + } } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index f936cf565b2bc..0c700bdb370ac 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -124,7 +124,7 @@ case class InsertIntoHiveTable( * * Note: this is run once and then kept to avoid double insertions. 
*/ - protected[sql] lazy val sideEffectResult: Seq[InternalRow] = { + protected[sql] lazy val sideEffectResult: Seq[Row] = { // Have to pass the TableDesc object to RDD.mapPartitions and then instantiate new serializer // instances within the closure, since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc @@ -267,10 +267,10 @@ case class InsertIntoHiveTable( // however for now we return an empty list to simplify compatibility checks with hive, which // does not return anything for insert operations. // TODO: implement hive compatibility as rules. - Seq.empty[InternalRow] + Seq.empty[Row] } - override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray + override def executeCollect(): Array[Row] = sideEffectResult.toArray protected override def doExecute(): RDD[InternalRow] = { sqlContext.sparkContext.parallelize(sideEffectResult.asInstanceOf[Seq[InternalRow]], 1) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index a9db70119d011..2ec9cf1d4075e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -37,7 +37,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule @@ -158,7 +157,7 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre @transient private lazy val conversionHelper = new ConversionHelper(method, arguments) - override val dataType = javaClassToDataType(method.getReturnType) + val dataType = javaClassToDataType(method.getReturnType) @transient lazy val returnInspector = ObjectInspectorFactory.getReflectionObjectInspector( @@ -229,7 +228,7 @@ private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, childr new DeferredObjectAdapter(inspect, child.dataType) }.toArray[DeferredObject] - override val dataType: DataType = inspectorToDataType(returnInspector) + lazy val dataType: DataType = inspectorToDataType(returnInspector) override def eval(input: InternalRow): Any = { returnInspector // Make sure initialized. @@ -255,12 +254,6 @@ private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, childr * Resolves [[UnresolvedWindowFunction]] to [[HiveWindowFunction]]. */ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { - private def shouldResolveFunction( - unresolvedWindowFunction: UnresolvedWindowFunction, - windowSpec: WindowSpecDefinition): Boolean = { - unresolvedWindowFunction.childrenResolved && windowSpec.childrenResolved - } - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p: LogicalPlan if !p.childrenResolved => p @@ -268,11 +261,9 @@ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { // replaced those WindowSpecReferences. case p: LogicalPlan => p transformExpressions { - // We will not start to resolve the function unless all arguments are resolved - // and all expressions in window spec are fixed. 
case WindowExpression( - u @ UnresolvedWindowFunction(name, children), - windowSpec: WindowSpecDefinition) if shouldResolveFunction(u, windowSpec) => + UnresolvedWindowFunction(name, children), + windowSpec: WindowSpecDefinition) => // First, let's find the window function info. val windowFunctionInfo: WindowFunctionInfo = Option(FunctionRegistry.getWindowFunctionInfo(name.toLowerCase)).getOrElse( @@ -288,7 +279,7 @@ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { // are expressions in Order By clause. if (classOf[GenericUDAFRank].isAssignableFrom(functionClass)) { if (children.nonEmpty) { - throw new AnalysisException(s"$name does not take input parameters.") + throw new AnalysisException(s"$name does not take input parameters.") } windowSpec.orderSpec.map(_.child) } else { @@ -390,7 +381,7 @@ private[hive] case class HiveWindowFunction( evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors) } - override val dataType: DataType = + override def dataType: DataType = if (!pivotResult) { inspectorToDataType(returnInspector) } else { @@ -475,6 +466,70 @@ private[hive] case class HiveWindowFunction( new HiveWindowFunction(funcWrapper, pivotResult, isUDAFBridgeRequired, children) } +private[hive] case class HiveGenericUDAF( + funcWrapper: HiveFunctionWrapper, + children: Seq[Expression]) extends AggregateExpression1 + with HiveInspectors { + + type UDFType = AbstractGenericUDAFResolver + + @transient + protected lazy val resolver: AbstractGenericUDAFResolver = funcWrapper.createFunction() + + @transient + protected lazy val objectInspector = { + val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) + resolver.getEvaluator(parameterInfo) + .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) + } + + @transient + protected lazy val inspectors = children.map(toInspector) + + def dataType: DataType = inspectorToDataType(objectInspector) + + def nullable: Boolean = true + + override def toString: String = { + s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})" + } + + def newInstance(): HiveUDAFFunction = new HiveUDAFFunction(funcWrapper, children, this) +} + +/** It is used as a wrapper for the hive functions which uses UDAF interface */ +private[hive] case class HiveUDAF( + funcWrapper: HiveFunctionWrapper, + children: Seq[Expression]) extends AggregateExpression1 + with HiveInspectors { + + type UDFType = UDAF + + @transient + protected lazy val resolver: AbstractGenericUDAFResolver = + new GenericUDAFBridge(funcWrapper.createFunction()) + + @transient + protected lazy val objectInspector = { + val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) + resolver.getEvaluator(parameterInfo) + .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) + } + + @transient + protected lazy val inspectors = children.map(toInspector) + + def dataType: DataType = inspectorToDataType(objectInspector) + + def nullable: Boolean = true + + override def toString: String = { + s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})" + } + + def newInstance(): HiveUDAFFunction = new HiveUDAFFunction(funcWrapper, children, this, true) +} + /** * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a * [[Generator]]. 
Note that the semantics of Generators do not allow @@ -554,10 +609,6 @@ private[hive] case class HiveGenericUDTF( } } -/** - * Currently we don't support partial aggregation for queries using Hive UDAF, which may hurt - * performance a lot. - */ private[hive] case class HiveUDAFFunction( funcWrapper: HiveFunctionWrapper, children: Seq[Expression], @@ -572,43 +623,35 @@ private[hive] case class HiveUDAFFunction( override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = copy(inputAggBufferOffset = newInputAggBufferOffset) - @transient - private lazy val resolver = + private val resolver = if (isUDAFBridgeRequired) { new GenericUDAFBridge(funcWrapper.createFunction[UDAF]()) } else { funcWrapper.createFunction[AbstractGenericUDAFResolver]() } - @transient - private lazy val inspectors = children.map(toInspector).toArray + private val inspectors = exprs.map(toInspector).toArray - @transient - private lazy val functionAndInspector = { + private val function = { val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false) - val f = resolver.getEvaluator(parameterInfo) - f -> f.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors) + resolver.getEvaluator(parameterInfo) } - @transient - private lazy val function = functionAndInspector._1 - - @transient - private lazy val returnInspector = functionAndInspector._2 + private val returnInspector = function.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors) - @transient - private[this] var buffer: GenericUDAFEvaluator.AggregationBuffer = _ + private val buffer = + function.getNewAggregationBuffer override def eval(input: InternalRow): Any = unwrap(function.evaluate(buffer), returnInspector) @transient - private lazy val inputProjection = new InterpretedProjection(children) + val inputProjection = new InterpretedProjection(exprs) @transient - private lazy val cached = new Array[AnyRef](children.length) + protected lazy val cached = new Array[AnyRef](exprs.length) @transient - private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray + private lazy val inputDataTypes: Array[DataType] = exprs.map(_.dataType).toArray // Hive UDAF has its own buffer, so we don't need to occupy a slot in the aggregation // buffer for it. 
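The hiveUDFs.scala hunk above swaps several @transient private lazy val members of HiveUDAFFunction for eagerly initialized vals. As a hedged, standalone illustration of what that pattern does (plain Scala, hypothetical names, not code from this patch): a @transient lazy val is skipped during Java serialization and rebuilt on first use after deserialization, which is how Spark-style code defers constructing heavyweight or non-serializable helpers until the object has reached an executor.

import java.io._

// Hypothetical stand-in for a UDAF wrapper; it only illustrates the serialization pattern.
case class UdafWrapper(functionClassName: String) {
  // Excluded from serialization; re-initialized lazily after the object is deserialized.
  @transient private lazy val evaluator: StringBuilder = {
    println(s"initializing evaluator for $functionClassName")
    new StringBuilder(functionClassName)
  }

  def describe(): String = evaluator.toString.toUpperCase
}

object TransientLazySketch {
  def main(args: Array[String]): Unit = {
    val original = UdafWrapper("my.custom.Average")

    // Round-trip through Java serialization, roughly what happens when Spark ships a closure.
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(original)
    oos.close()
    val copy = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray))
      .readObject().asInstanceOf[UdafWrapper]

    // The transient evaluator was not serialized; it is rebuilt here on first access.
    println(copy.describe())
  }
}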
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 81ee9ba71beb6..80a61f82fd24f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -81,9 +81,9 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef test("Double create fails when allowExisting = false") { sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") - intercept[QueryExecutionException] { + val message = intercept[QueryExecutionException] { sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") - } + }.getMessage } test("Double create does not fail when allowExisting = true") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index f74eb1500b989..f21e4964204e2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -571,7 +571,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv Row(3) :: Row(4) :: Nil) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(p: ParquetRelation, _) => // OK + case LogicalRelation(p: ParquetRelation) => // OK case _ => fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation]}") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 5f9a447759b48..d9ba895e1eceb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -21,8 +21,7 @@ import java.io.{DataInput, DataOutput} import java.util.{ArrayList, Arrays, Properties} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.ql.udf.UDAFPercentile -import org.apache.hadoop.hive.ql.udf.generic.{GenericUDFOPAnd, GenericUDTFExplode, GenericUDAFAverage, GenericUDF} +import org.apache.hadoop.hive.ql.udf.generic.{GenericUDAFAverage, GenericUDF} import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory} @@ -132,7 +131,7 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton { hiveContext.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault) } - test("SPARK-6409 UDAF Average test") { + test("SPARK-6409 UDAFAverage test") { sql(s"CREATE TEMPORARY FUNCTION test_avg AS '${classOf[GenericUDAFAverage].getName}'") checkAnswer( sql("SELECT test_avg(1), test_avg(substr(value,5)) FROM src"), @@ -300,62 +299,6 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton { hiveContext.reset() } - - test("Hive UDFs with insufficient number of input arguments should trigger an analysis error") { - Seq((1, 2)).toDF("a", "b").registerTempTable("testUDF") - - { - // HiveSimpleUDF - sql(s"CREATE TEMPORARY FUNCTION testUDFTwoListList AS '${classOf[UDFTwoListList].getName}'") - val message = intercept[AnalysisException] { - sql("SELECT testUDFTwoListList() 
FROM testUDF") - }.getMessage - assert(message.contains("No handler for Hive udf")) - sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFTwoListList") - } - - { - // HiveGenericUDF - sql(s"CREATE TEMPORARY FUNCTION testUDFAnd AS '${classOf[GenericUDFOPAnd].getName}'") - val message = intercept[AnalysisException] { - sql("SELECT testUDFAnd() FROM testUDF") - }.getMessage - assert(message.contains("No handler for Hive udf")) - sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFAnd") - } - - { - // Hive UDAF - sql(s"CREATE TEMPORARY FUNCTION testUDAFPercentile AS '${classOf[UDAFPercentile].getName}'") - val message = intercept[AnalysisException] { - sql("SELECT testUDAFPercentile(a) FROM testUDF GROUP BY b") - }.getMessage - assert(message.contains("No handler for Hive udf")) - sql("DROP TEMPORARY FUNCTION IF EXISTS testUDAFPercentile") - } - - { - // AbstractGenericUDAFResolver - sql(s"CREATE TEMPORARY FUNCTION testUDAFAverage AS '${classOf[GenericUDAFAverage].getName}'") - val message = intercept[AnalysisException] { - sql("SELECT testUDAFAverage() FROM testUDF GROUP BY b") - }.getMessage - assert(message.contains("No handler for Hive udf")) - sql("DROP TEMPORARY FUNCTION IF EXISTS testUDAFAverage") - } - - { - // Hive UDTF - sql(s"CREATE TEMPORARY FUNCTION testUDTFExplode AS '${classOf[GenericUDTFExplode].getName}'") - val message = intercept[AnalysisException] { - sql("SELECT testUDTFExplode() FROM testUDF") - }.getMessage - assert(message.contains("No handler for Hive udf")) - sql("DROP TEMPORARY FUNCTION IF EXISTS testUDTFExplode") - } - - sqlContext.dropTempTable("testUDF") - } } class TestPair(x: Int, y: Int) extends Writable with Serializable { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index af48d478953b4..24c9794d169a2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -268,7 +268,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = { val relation = EliminateSubQueries(catalog.lookupRelation(TableIdentifier(tableName))) relation match { - case LogicalRelation(r: ParquetRelation, _) => + case LogicalRelation(r: ParquetRelation) => if (!isDataSourceParquet) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index 905eb7a3925b2..784b9342280b5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -282,7 +282,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { ) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(_: ParquetRelation, _) => // OK + case LogicalRelation(_: ParquetRelation) => // OK case _ => fail( "test_parquet_ctas should be converted to " + s"${classOf[ParquetRelation].getCanonicalName }") @@ -369,7 +369,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { assertResult(2) { analyzed.collect { - case r @ LogicalRelation(_: ParquetRelation, _) => r + case r@LogicalRelation(_: ParquetRelation) => r }.size } } @@ -378,7 +378,7 @@ class ParquetMetastoreSuite extends 
ParquetPartitioningTest { def collectParquetRelation(df: DataFrame): ParquetRelation = { val plan = df.queryExecution.analyzed plan.collectFirst { - case LogicalRelation(r: ParquetRelation, _) => r + case LogicalRelation(r: ParquetRelation) => r }.getOrElse { fail(s"Expecting a ParquetRelation2, but got:\n$plan") } @@ -428,7 +428,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { // Converted test_parquet should be cached. catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => fail("Converted test_parquet should be cached in the cache.") - case logical @ LogicalRelation(parquetRelation: ParquetRelation, _) => // OK + case logical @ LogicalRelation(parquetRelation: ParquetRelation) => // OK case other => fail( "The cached test_parquet should be a Parquet Relation. " + @@ -620,7 +620,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { val conf = Seq( HiveContext.CONVERT_METASTORE_PARQUET.key -> "false", SQLConf.PARQUET_BINARY_AS_STRING.key -> "true", - SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key -> "true") withSQLConf(conf: _*) { sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 100b97137cff0..8a36937e95989 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -500,7 +500,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } val actualPaths = df.queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: HadoopFsRelation, _) => + case LogicalRelation(relation: HadoopFsRelation) => relation.paths.toSet }.getOrElse { fail("Expect an FSBasedRelation, but none could be found") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index 7829f5e887995..3c30eb82b4e3a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -38,7 +38,9 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { def start(time: Time) { this.synchronized { - require(zeroTime == null, "DStream graph computation already started") + if (zeroTime != null) { + throw new Exception("DStream graph computation already started") + } zeroTime = time startTime = time outputStreams.foreach(_.initialize(zeroTime)) @@ -66,16 +68,20 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { def setBatchDuration(duration: Duration) { this.synchronized { - require(batchDuration == null, - s"Batch duration already set as $batchDuration. Cannot set it again.") + if (batchDuration != null) { + throw new Exception("Batch duration already set as " + batchDuration + + ". cannot set it again.") + } batchDuration = duration } } def remember(duration: Duration) { this.synchronized { - require(rememberDuration == null, - s"Remember duration already set as $rememberDuration. Cannot set it again.") + if (rememberDuration != null) { + throw new Exception("Remember duration already set as " + batchDuration + + ". 
cannot set it again.") + } rememberDuration = duration } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 2480b4ec093e2..ccc8f7c9eeec9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -183,9 +183,21 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo)) } job.result match { + case Success(_) => + val jobSet = jobSets.get(job.time) + jobSet.handleJobCompletion(job) + logInfo("Finished job " + job.id + " from job set of time " + jobSet.time) + if (jobSet.hasCompleted) { + jobSets.remove(jobSet.time) + jobGenerator.onBatchCompletion(jobSet.time) + logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format( + jobSet.totalDelay / 1000.0, jobSet.time.toString, + jobSet.processingDelay / 1000.0 + )) + listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo)) + } case Failure(e) => reportError("Error running job " + job, e) - case _ => } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index f76300351e3c0..5fb5555f95f42 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -18,10 +18,8 @@ package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet -import scala.util.Failure import org.apache.spark.streaming.Time -import org.apache.spark.util.Utils /** Class representing a set of Jobs * belong to the same batch. diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java index 3ced2094f5e6b..c08c9c73d2396 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java @@ -19,11 +19,7 @@ import org.apache.spark.unsafe.Platform; -import java.util.Arrays; - -public final class ByteArray { - - public static final byte[] EMPTY_BYTE = new byte[0]; +public class ByteArray { /** * Writes the content of a byte array into a memory address, identified by an object and an @@ -33,45 +29,4 @@ public final class ByteArray { public static void writeToMemory(byte[] src, Object target, long targetOffset) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET, target, targetOffset, src.length); } - - /** - * Returns a 64-bit integer that can be used as the prefix used in sorting. 
- */ - public static long getPrefix(byte[] bytes) { - if (bytes == null) { - return 0L; - } else { - final int minLen = Math.min(bytes.length, 8); - long p = 0; - for (int i = 0; i < minLen; ++i) { - p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i)) - << (56 - 8 * i); - } - return p; - } - } - - public static byte[] subStringSQL(byte[] bytes, int pos, int len) { - // This pos calculation is according to UTF8String#subStringSQL - if (pos > bytes.length) { - return EMPTY_BYTE; - } - int start = 0; - int end; - if (pos > 0) { - start = pos - 1; - } else if (pos < 0) { - start = bytes.length + pos; - } - if ((bytes.length - start) < len) { - end = bytes.length; - } else { - end = start + len; - } - start = Math.max(start, 0); // underflow - if (start >= end) { - return EMPTY_BYTE; - } - return Arrays.copyOfRange(bytes, start, end); - } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 50ae7ffeec4c5..f190669df9565 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -362,7 +362,7 @@ private[spark] class ApplicationMaster( if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) { finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, - s"Max number of executor failures ($maxNumExecutorFailures) reached") + "Max number of executor failures reached") } else { logDebug("Sending progress") allocator.allocateResources() diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index 1165061db21e3..54f62e6b723ac 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -81,7 +81,25 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) .orNull // If dynamic allocation is enabled, start at the configured initial number of executors. // Default to minExecutors if no initialExecutors is set. 
- numExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf) + if (isDynamicAllocationEnabled) { + val minExecutorsConf = "spark.dynamicAllocation.minExecutors" + val initialExecutorsConf = "spark.dynamicAllocation.initialExecutors" + val maxExecutorsConf = "spark.dynamicAllocation.maxExecutors" + val minNumExecutors = sparkConf.getInt(minExecutorsConf, 0) + val initialNumExecutors = sparkConf.getInt(initialExecutorsConf, minNumExecutors) + val maxNumExecutors = sparkConf.getInt(maxExecutorsConf, Integer.MAX_VALUE) + + // If defined, initial executors must be between min and max + if (initialNumExecutors < minNumExecutors || initialNumExecutors > maxNumExecutors) { + throw new IllegalArgumentException( + s"$initialExecutorsConf must be between $minExecutorsConf and $maxNumExecutors!") + } + + numExecutors = initialNumExecutors + } else { + val numExecutorsConf = "spark.executor.instances" + numExecutors = sparkConf.getInt(numExecutorsConf, numExecutors) + } principal = Option(principal) .orElse(sparkConf.getOption("spark.yarn.principal")) .orNull diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 4d9e777cb4134..79a321c89a27a 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -89,7 +89,11 @@ private[yarn] class YarnAllocator( @volatile private var numExecutorsFailed = 0 @volatile private var targetNumExecutors = - YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf) + if (Utils.isDynamicAllocationEnabled(sparkConf)) { + sparkConf.getInt("spark.dynamicAllocation.initialExecutors", 0) + } else { + sparkConf.getInt("spark.executor.instances", YarnSparkHadoopUtil.DEFAULT_NUMBER_EXECUTORS) + } // Executor loss reason requests that are pending - maps from executor ID for inquiry to a // list of requesters that should be responded to once we find out why the given executor diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 561ad79ee0228..06984f7018d43 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -388,28 +388,5 @@ object YarnSparkHadoopUtil { def getClassPathSeparator(): String = { classPathSeparatorField.get(null).asInstanceOf[String] } - - /** - * Getting the initial target number of executors depends on whether dynamic allocation is - * enabled. - */ - def getInitialTargetExecutorNumber(conf: SparkConf): Int = { - if (Utils.isDynamicAllocationEnabled(conf)) { - val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0) - val initialNumExecutors = - conf.getInt("spark.dynamicAllocation.initialExecutors", minNumExecutors) - val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", Int.MaxValue) - require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors, - s"initial executor number $initialNumExecutors must between min executor number" + - s"$minNumExecutors and max executor number $maxNumExecutors") - - initialNumExecutors - } else { - val targetNumExecutors = - sys.env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(DEFAULT_NUMBER_EXECUTORS) - // System property can override environment variable. 
- conf.getInt("spark.executor.instances", targetNumExecutors) - } - } } diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala index 50b699f11b21c..1aed5a1675075 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala @@ -17,13 +17,21 @@ package org.apache.spark.scheduler.cluster +import java.net.NetworkInterface + import org.apache.hadoop.yarn.api.ApplicationConstants.Environment + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.yarn.api.records.NodeState +import org.apache.hadoop.yarn.client.api.YarnClient import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.spark.SparkContext import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil +import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.scheduler.TaskSchedulerImpl -import org.apache.spark.util.Utils +import org.apache.spark.util.{IntParam, Utils} private[spark] class YarnClusterSchedulerBackend( scheduler: TaskSchedulerImpl, @@ -32,7 +40,13 @@ private[spark] class YarnClusterSchedulerBackend( override def start() { super.start() - totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sc.conf) + totalExpectedExecutors = DEFAULT_NUMBER_EXECUTORS + if (System.getenv("SPARK_EXECUTOR_INSTANCES") != null) { + totalExpectedExecutors = IntParam.unapply(System.getenv("SPARK_EXECUTOR_INSTANCES")) + .getOrElse(totalExpectedExecutors) + } + // System property can override environment variable. + totalExpectedExecutors = sc.getConf.getInt("spark.executor.instances", totalExpectedExecutors) } override def applicationId(): String = diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala index a70e66d39a64e..bfeae111b9a04 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala @@ -34,7 +34,6 @@ import org.scalatest.Matchers import org.apache.hadoop.yarn.api.records.ApplicationAccessType import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException, SparkFunSuite} -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.util.Utils