logloss -> logLoss
zhengruifeng committed Oct 16, 2019
commit a981f7b0ce459e5935548f499feb50034f06d756
mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.types.DoubleType

/**
* Evaluator for multiclass classification, which expects input columns: prediction, label,
- * weight(optional) and probabilityCol(only for log-loss).
+ * weight(optional) and probability(only for logLoss).
*/
@Since("1.5.0")
class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") override val uid: String)
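For orientation, a minimal usage sketch of the renamed metric (mirroring the suite test further down; the DataFrame `df` and its column names are assumptions):

```scala
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Minimal sketch: df is an assumed DataFrame with prediction, label and
// probability columns; "logLoss" is the only metric that reads probability.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("logLoss")
val loss = evaluator.evaluate(df)
```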
@@ -46,7 +46,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
* `"weightedPrecision"`, `"weightedRecall"`, `"weightedTruePositiveRate"`,
* `"weightedFalsePositiveRate"`, `"weightedFMeasure"`, `"truePositiveRateByLabel"`,
* `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
* `"fMeasureByLabel"`, `"logloss"`)
* `"fMeasureByLabel"`, `"logLoss"`)
*
* @group param
*/
@@ -116,7 +116,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid

@Since("3.0.0")
final val eps: DoubleParam = new DoubleParam(this, "eps",
"Log loss is undefined for p=0 or p=1, so probabilities are clipped to " +
"LogLoss is undefined for p=0 or p=1, so probabilities are clipped to " +
"max(eps, min(1 - eps, p)).",
ParamValidators.inRange(0, 0.5, false, false))
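The clipping described by the eps doc keeps the loss finite at p = 0 and p = 1; a small sketch of the per-sample term it implies, assuming nothing beyond the formula above:

```scala
// Per-sample log-loss term implied by the eps doc: clamp the true-class
// probability into [eps, 1 - eps], then take the negative log.
def clippedNegLog(p: Double, eps: Double = 1e-15): Double = {
  val clipped = math.max(eps, math.min(1 - eps, p))
  -math.log(clipped)
}

clippedNegLog(0.0) // ≈ 34.54 rather than Double.PositiveInfinity
```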

@@ -142,7 +142,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
lit(1.0)
}

-    val rdd = if ($(metricName) == "logloss") {
+    val rdd = if ($(metricName) == "logLoss") {
// probabilityCol is only needed to compute logloss
require(isDefined(probabilityCol) && $(probabilityCol).nonEmpty)
val p = DatasetUtils.columnToVector(dataset, $(probabilityCol))
@@ -172,7 +172,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
case "precisionByLabel" => metrics.precision($(metricLabel))
case "recallByLabel" => metrics.recall($(metricLabel))
case "fMeasureByLabel" => metrics.fMeasure($(metricLabel), $(beta))
case "logloss" => metrics.logloss($(eps))
case "logLoss" => metrics.logLoss($(eps))
}
}

@@ -193,7 +193,7 @@ object MulticlassClassificationEvaluator
private val supportedMetricNames = Array("f1", "accuracy", "weightedPrecision", "weightedRecall",
"weightedTruePositiveRate", "weightedFalsePositiveRate", "weightedFMeasure",
"truePositiveRateByLabel", "falsePositiveRateByLabel", "precisionByLabel", "recallByLabel",
"fMeasureByLabel", "logloss")
"fMeasureByLabel", "logLoss")

@Since("1.6.0")
override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Matrices, Matrix}
import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.{DataFrame, Row}
+ import org.apache.spark.sql.DataFrame

/**
* Evaluator for multiclass classification.
@@ -37,7 +37,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[_ <: Product])
/**
* An auxiliary constructor taking a DataFrame.
* @param predictionAndLabels a DataFrame with columns: prediction, label, weight(optional)
Member: Nit: spaces before paren
- * and probability(only for logloss)
+ * and probability(only for logLoss)
*/
private[mllib] def this(predictionAndLabels: DataFrame) =
this(predictionAndLabels.rdd.map { r =>
Contributor (author): matching will not work in pyspark, so I have to use r.get instead. MultilabelMetrics also deals with dataframe in this way.
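A sketch of the constraint the author describes (hypothetical two-column rows, not the PR's exact code): pattern matching on Row contents assumes exact runtime types, which rows created in PySpark may not satisfy, so fields are read by position and cast:

```scala
// Hypothetical illustration: instead of
//   df.rdd.map { case Row(pred: Double, label: Double) => (pred, label) }
// which can fail to match on rows originating in PySpark, read by index.
df.rdd.map { r =>
  (r.get(0).asInstanceOf[Double], r.get(1).asInstanceOf[Double])
}
```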

@@ -241,12 +241,12 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[_ <: Product])
lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted

/**
- * Returns the log-loss, aka logistic loss or cross-entropy loss.
- * @param eps Log loss is undefined for p=0 or p=1, so probabilities are
+ * Returns the logLoss, aka logistic loss or cross-entropy loss.
Member: You could just use a @return tag. Also log-loss rather than logLoss.
+ * @param eps LogLoss is undefined for p=0 or p=1, so probabilities are
* clipped to max(eps, min(1 - eps, p)).
*/
@Since("3.0.0")
-  def logloss(eps: Double = 1e-15): Double = {
+  def logLoss(eps: Double = 1e-15): Double = {
require(eps > 0 && eps < 0.5, s"eps must be in range (0, 0.5), but got $eps")
val loss1 = - math.log(eps)
val loss2 = - math.log(1 - eps)
Member: - math.log1p(-eps)? because eps is going to be very small
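The reviewer's log1p suggestion is a floating-point accuracy point; a quick check, with eps = 1e-17 chosen to make the rounding visible:

```scala
// For tiny eps, 1 - eps can round to exactly 1.0 in Double arithmetic,
// silently dropping eps; math.log1p(-eps) never forms 1 - eps at all.
val eps = 1e-17
math.log(1 - eps) // 0.0, because 1 - 1e-17 rounds to 1.0
math.log1p(-eps)  // ≈ -1.0e-17, correct to machine precision
```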

mllib/src/test/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluatorSuite.scala
@@ -77,7 +77,7 @@ class MulticlassClassificationEvaluatorSuite
}.toDF("prediction", "label", "probability")

val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("logloss")
.setMetricName("logLoss")
assert(evaluator.evaluate(df) ~== 0.9682005730687164 absTol 1e-5)
}
}
mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
@@ -206,14 +206,14 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
(prediction, label, weight, probability)
}
val metrics = new MulticlassMetrics(rdd)
-    assert(metrics.logloss() ~== 0.16145936283256573 relTol delta)
+    assert(metrics.logLoss() ~== 0.16145936283256573 relTol delta)

val rdd2 = rdd.map {
case (prediction: Double, label: Double, weight: Double, probability: Array[Double]) =>
(prediction, label, 1.0, probability)
}
val metrics2 = new MulticlassMetrics(rdd2)
-    assert(metrics2.logloss() ~== 0.21616187468057912 relTol delta)
+    assert(metrics2.logLoss() ~== 0.21616187468057912 relTol delta)
}

test("MulticlassMetrics supports multi-class log-loss") {
@@ -245,13 +245,13 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
(prediction, label, weight, probability)
}
val metrics = new MulticlassMetrics(rdd)
-    assert(metrics.logloss() ~== 1.3529429766879466 relTol delta)
+    assert(metrics.logLoss() ~== 1.3529429766879466 relTol delta)

val rdd2 = rdd.map {
case (prediction: Double, label: Double, weight: Double, probability: Array[Double]) =>
(prediction, label, 1.0, probability)
}
val metrics2 = new MulticlassMetrics(rdd2)
-    assert(metrics2.logloss() ~== 0.9682005730687164 relTol delta)
+    assert(metrics2.logLoss() ~== 0.9682005730687164 relTol delta)
}
}
python/pyspark/ml/evaluation.py (8 changes: 4 additions & 4 deletions)
@@ -317,7 +317,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
HasProbabilityCol, JavaMLReadable, JavaMLWritable):
"""
Evaluator for Multiclass Classification, which expects input
-    columns: prediction, label, weight(optional) and probabilityCol(only for log-loss).
+    columns: prediction, label, weight(optional) and probabilityCol(only for logLoss).

>>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
@@ -352,7 +352,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
... "label", "weight", "probability"])
>>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
... probabilityCol="probability")
>>> evaluator.setMetricName("logloss")
>>> evaluator.setMetricName("logLoss")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.9682...
@@ -364,7 +364,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
"(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
"weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
"falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
"logloss)",
"logLoss)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
"The class whose metric will be computed in truePositiveRateByLabel|"
@@ -376,7 +376,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
" Must be > 0. The default value is 1.",
typeConverter=TypeConverters.toFloat)
eps = Param(Params._dummy(), "eps",
"Log loss is undefined for p=0 or p=1, so probabilities are clipped to "
"LogLoss is undefined for p=0 or p=1, so probabilities are clipped to "
"max(eps, min(1 - eps, p)). "
"Must be in range (0, 0.5). The default value is 1e-15.",
typeConverter=TypeConverters.toFloat)
python/pyspark/mllib/evaluation.py (8 changes: 4 additions & 4 deletions)
@@ -244,7 +244,7 @@ class MulticlassMetrics(JavaModelWrapper):
... (1.0, 1.0, 1.0, [0.1, 0.8, 0.1]), (0.0, 2.0, 1.0, [0.9, 0.05, 0.05]),
... (0.0, 0.0, 1.0, [0.8, 0.2, 0.0]), (1.0, 1.0, 1.0, [0.3, 0.65, 0.05])])
>>> metrics = MulticlassMetrics(predictionAndLabelsWithProbabilities)
>>> metrics.logloss()
>>> metrics.logLoss()
0.9682...

.. versionadded:: 1.4.0
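The 0.9682... value in the doctest above can be checked by hand: with unit weights, the metric is the mean of -log(p) over the probability each row assigns to its true label (0.8, 0.05, 0.8 and 0.65 in the four rows shown):

```scala
// Hand-check of the doctest value: mean of -log(p_true) over the four rows.
val pTrue = Seq(0.8, 0.05, 0.8, 0.65)
val logLoss = pTrue.map(p => -math.log(p)).sum / pTrue.size
// logLoss ≈ 0.9682005730687164
```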
@@ -366,11 +366,11 @@ def weightedFMeasure(self, beta=None):
return self.call("weightedFMeasure", beta)

@since('3.0.0')
-    def logloss(self, eps=1e-15):
+    def logLoss(self, eps=1e-15):
"""
-        Returns weighted log-loss.
+        Returns weighted logLoss.
"""
return self.call("logloss", eps)
return self.call("logLoss", eps)


class RankingMetrics(JavaModelWrapper):
Expand Down