Skip to content

Commit 8f641ac

Browse files
Committed: Merge pull request #7 from apache/master
merge upstream changes
2 parents 4e98236 + e537b33 commit 8f641ac

File tree

6 files changed

+100
-10
lines changed

6 files changed

+100
-10
lines changed

mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ import org.apache.spark.util.Utils
3232
* :: Experimental ::
3333
* Maps a sequence of terms to their term frequencies using the hashing trick.
3434
*
35-
* @param numFeatures number of features (default: 1000000)
35+
* @param numFeatures number of features (default: 2^20^)
3636
*/
3737
@Experimental
3838
class HashingTF(val numFeatures: Int) extends Serializable {
3939

40-
def this() = this(1000000)
40+
def this() = this(1 << 20)
4141

4242
/**
4343
* Returns the index of the input term.

mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ package org.apache.spark.mllib.feature
1919

2020
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV}
2121

22-
import org.apache.spark.annotation.DeveloperApi
22+
import org.apache.spark.annotation.Experimental
2323
import org.apache.spark.mllib.linalg.{Vector, Vectors}
2424

2525
/**
26-
* :: DeveloperApi ::
26+
* :: Experimental ::
2727
* Normalizes samples individually to unit L^p^ norm
2828
*
2929
* For any 1 <= p < Double.PositiveInfinity, normalizes samples using
@@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors}
3333
*
3434
* @param p Normalization in L^p^ space, p = 2 by default.
3535
*/
36-
@DeveloperApi
36+
@Experimental
3737
class Normalizer(p: Double) extends VectorTransformer {
3838

3939
def this() = this(2)

mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,22 @@ package org.apache.spark.mllib.feature
1919

2020
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
2121

22-
import org.apache.spark.annotation.DeveloperApi
22+
import org.apache.spark.annotation.Experimental
2323
import org.apache.spark.mllib.linalg.{Vector, Vectors}
2424
import org.apache.spark.mllib.rdd.RDDFunctions._
2525
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
2626
import org.apache.spark.rdd.RDD
2727

2828
/**
29-
* :: DeveloperApi ::
29+
* :: Experimental ::
3030
* Standardizes features by removing the mean and scaling to unit variance using column summary
3131
* statistics on the samples in the training set.
3232
*
3333
* @param withMean False by default. Centers the data with mean before scaling. It will build a
3434
* dense output, so this does not work on sparse input and will raise an exception.
3535
* @param withStd True by default. Scales the data to unit standard deviation.
3636
*/
37-
@DeveloperApi
37+
@Experimental
3838
class StandardScaler(withMean: Boolean, withStd: Boolean) extends VectorTransformer {
3939

4040
def this() = this(false, true)

mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717

1818
package org.apache.spark.mllib.feature
1919

20+
import java.lang.{Iterable => JavaIterable}
21+
22+
import scala.collection.JavaConverters._
2023
import scala.collection.mutable
2124
import scala.collection.mutable.ArrayBuffer
2225

@@ -25,6 +28,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
2528
import org.apache.spark.Logging
2629
import org.apache.spark.SparkContext._
2730
import org.apache.spark.annotation.Experimental
31+
import org.apache.spark.api.java.JavaRDD
2832
import org.apache.spark.mllib.linalg.{Vector, Vectors}
2933
import org.apache.spark.mllib.rdd.RDDFunctions._
3034
import org.apache.spark.rdd._
@@ -239,7 +243,7 @@ class Word2Vec extends Serializable with Logging {
239243
a += 1
240244
}
241245
}
242-
246+
243247
/**
244248
* Computes the vector representation of each word in vocabulary.
245249
* @param dataset an RDD of words
@@ -369,11 +373,22 @@ class Word2Vec extends Serializable with Logging {
369373

370374
new Word2VecModel(word2VecMap.toMap)
371375
}
376+
377+
/**
378+
* Computes the vector representation of each word in vocabulary (Java version).
379+
* @param dataset a JavaRDD of words
380+
* @return a Word2VecModel
381+
*/
382+
def fit[S <: JavaIterable[String]](dataset: JavaRDD[S]): Word2VecModel = {
383+
fit(dataset.rdd.map(_.asScala))
384+
}
372385
}
373386

374387
/**
375-
* Word2Vec model
388+
* :: Experimental ::
389+
* Word2Vec model
376390
*/
391+
@Experimental
377392
class Word2VecModel private[mllib] (
378393
private val model: Map[String, Array[Float]]) extends Serializable {
379394

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.mllib.feature;
19+
20+
import java.io.Serializable;
21+
import java.util.List;
22+
23+
import scala.Tuple2;
24+
25+
import com.google.common.collect.Lists;
26+
import com.google.common.base.Strings;
27+
import org.junit.After;
28+
import org.junit.Assert;
29+
import org.junit.Before;
30+
import org.junit.Test;
31+
32+
import org.apache.spark.api.java.JavaRDD;
33+
import org.apache.spark.api.java.JavaSparkContext;
34+
35+
public class JavaWord2VecSuite implements Serializable {
36+
private transient JavaSparkContext sc;
37+
38+
@Before
39+
public void setUp() {
40+
sc = new JavaSparkContext("local", "JavaWord2VecSuite");
41+
}
42+
43+
@After
44+
public void tearDown() {
45+
sc.stop();
46+
sc = null;
47+
}
48+
49+
@Test
50+
@SuppressWarnings("unchecked")
51+
public void word2Vec() {
52+
// The tests are to check Java compatibility.
53+
String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10);
54+
List<String> words = Lists.newArrayList(sentence.split(" "));
55+
List<List<String>> localDoc = Lists.newArrayList(words, words);
56+
JavaRDD<List<String>> doc = sc.parallelize(localDoc);
57+
Word2Vec word2vec = new Word2Vec()
58+
.setVectorSize(10)
59+
.setSeed(42L);
60+
Word2VecModel model = word2vec.fit(doc);
61+
Tuple2<String, Object>[] syms = model.findSynonyms("a", 2);
62+
Assert.assertEquals(2, syms.length);
63+
Assert.assertEquals("b", syms[0]._1());
64+
Assert.assertEquals("c", syms[1]._1());
65+
}
66+
}

python/pyspark/rdd.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ class MaxHeapQ(object):
134134

135135
"""
136136
An implementation of MaxHeap.
137+
137138
>>> import pyspark.rdd
138139
>>> heap = pyspark.rdd.MaxHeapQ(5)
139140
>>> [heap.insert(i) for i in range(10)]
@@ -381,6 +382,7 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
381382
def getNumPartitions(self):
382383
"""
383384
Returns the number of partitions in RDD
385+
384386
>>> rdd = sc.parallelize([1, 2, 3, 4], 2)
385387
>>> rdd.getNumPartitions()
386388
2
@@ -570,6 +572,7 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
570572
"""
571573
Sorts this RDD, which is assumed to consist of (key, value) pairs.
572574
# noqa
575+
573576
>>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
574577
>>> sc.parallelize(tmp).sortByKey(True, 2).collect()
575578
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
@@ -1209,6 +1212,7 @@ def collectAsMap(self):
12091212
def keys(self):
12101213
"""
12111214
Return an RDD with the keys of each tuple.
1215+
12121216
>>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
12131217
>>> m.collect()
12141218
[1, 3]
@@ -1218,6 +1222,7 @@ def keys(self):
12181222
def values(self):
12191223
"""
12201224
Return an RDD with the values of each tuple.
1225+
12211226
>>> m = sc.parallelize([(1, 2), (3, 4)]).values()
12221227
>>> m.collect()
12231228
[2, 4]
@@ -1642,6 +1647,7 @@ def repartition(self, numPartitions):
16421647
Internally, this uses a shuffle to redistribute data.
16431648
If you are decreasing the number of partitions in this RDD, consider
16441649
using `coalesce`, which can avoid performing a shuffle.
1650+
16451651
>>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
16461652
>>> sorted(rdd.glom().collect())
16471653
[[1], [2, 3], [4, 5], [6, 7]]
@@ -1656,6 +1662,7 @@ def repartition(self, numPartitions):
16561662
def coalesce(self, numPartitions, shuffle=False):
16571663
"""
16581664
Return a new RDD that is reduced into `numPartitions` partitions.
1665+
16591666
>>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
16601667
[[1], [2, 3], [4, 5]]
16611668
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
@@ -1694,6 +1701,7 @@ def name(self):
16941701
def setName(self, name):
16951702
"""
16961703
Assign a name to this RDD.
1704+
16971705
>>> rdd1 = sc.parallelize([1,2])
16981706
>>> rdd1.setName('RDD1')
16991707
>>> rdd1.name()
@@ -1753,6 +1761,7 @@ class PipelinedRDD(RDD):
17531761

17541762
"""
17551763
Pipelined maps:
1764+
17561765
>>> rdd = sc.parallelize([1, 2, 3, 4])
17571766
>>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
17581767
[4, 8, 12, 16]

0 commit comments

Comments (0)