Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added fixes from PR notes. Fixed style test failures.
  • Loading branch information
noel-smith committed Aug 27, 2015
commit bada4539227a3705337beea7e08bdc45183e2903
6 changes: 6 additions & 0 deletions python/pyspark/ml/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def evaluate(self, dataset, params=None):
raise ValueError("Params must be a param map but got %s." % type(params))

def isLargerBetter(self):
    """
    Indicates whether the metric returned by :py:meth:`evaluate` should be maximized
    (True, default) or minimized (False).
    A given evaluator may support multiple metrics which may be maximized or minimized.
    """
    # Base-class default: most metrics (e.g. accuracy, AUC) are larger-is-better.
    # Subclasses override this when their metric should be minimized.
    return True


Expand All @@ -89,6 +94,7 @@ def _evaluate(self, dataset):
return self._java_obj.evaluate(dataset._jdf)

def isLargerBetter(self):
    """
    Indicates whether the metric produced by the wrapped Java evaluator
    should be maximized (True) or minimized (False).
    """
    # Sync Python-side params to the JVM first: if the user calls this
    # before evaluate(), the params may not have been transferred yet and
    # the Java side would answer for the wrong (default) metric.
    self._transfer_params_to_java()
    return self._java_obj.isLargerBetter()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works for current use since it's called after evaluate has been called, but it could fail if a user calls it since the params may not have been transferred to java yet. Call self._transfer_params_to_java() first.



Expand Down
37 changes: 21 additions & 16 deletions python/pyspark/ml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,25 +267,27 @@ def test_ngram(self):
self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"])



class HasInducedError(Params):
    """
    Params mixin providing an ``inducedError`` parameter: the magnitude of
    uniformly-distributed noise added to a feature by the test estimators below.
    """

    def __init__(self):
        super(HasInducedError, self).__init__()
        # Declared once; the pasted diff residue assigned this twice.
        self.inducedError = Param(self, "inducedError",
                                  "Uniformly-distributed error added to feature")

    def getInducedError(self):
        """Return the current value of ``inducedError`` (or its default)."""
        return self.getOrDefault(self.inducedError)


class InducedErrorModel(Model, HasInducedError):
    """
    Test model that adds uniform random error, scaled by ``inducedError``,
    to the ``feature`` column as its ``prediction``.
    """

    def __init__(self):
        super(InducedErrorModel, self).__init__()

    def _transform(self, dataset):
        # rand(0) uses a fixed seed so the induced error is deterministic
        # across runs; prediction = feature + U[0,1) * inducedError.
        return dataset.withColumn("prediction",
                                  dataset.feature + (rand(0) * self.getInducedError()))


class InducedErrorEstimator(Estimator, HasInducedError):

def __init__(self, inducedError=1.0):
Expand All @@ -297,52 +299,55 @@ def _fit(self, dataset):
self._copyValues(model)
return model


class CrossValidatorTests(PySparkTestCase):
    """
    Tests that CrossValidator selects the best model for both minimized
    (rmse) and maximized (r2) evaluator metrics, i.e. that it honors
    Evaluator.isLargerBetter().
    """

    def test_fit_minimize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        # rmse is a smaller-is-better metric.
        evaluator = RegressionEvaluator(metricName="rmse")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        # With zero induced error the model predicts the label exactly.
        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")

    def test_fit_maximize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        # r2 is a larger-is-better metric.
        evaluator = RegressionEvaluator(metricName="r2")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        # With zero induced error the fit is perfect, so R-squared is 1.
        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")


Expand Down