Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update AIC calculation
  • Loading branch information
yanboliang committed Mar 14, 2016
commit 5d4c87b9ba9a88f2f8aee985e5f918bf02600ae9
Original file line number Diff line number Diff line change
Expand Up @@ -339,13 +339,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
/** The variance of the endogenous variable's mean, given the value mu. */
def variance(mu: Double): Double

/**
* Deviance of (y, mu) pair.
* Deviance is usually defined as twice the loglikelihood ratio.
*/
/** Deviance of (y, mu) pair. */
def deviance(y: Double, mu: Double, weight: Double): Double

def aic(predictions: RDD[(Double, Double, Double)], deviance: Double, weightSum: Double): Double
/** Akaike's 'An Information Criterion'(AIC) value of the family. */
def aic(predictions: RDD[(Double, Double, Double)], deviance: Double,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • document params, especially predictions (y, mu, weight)
  • chop down args

numInstances: Double, weightSum: Double): Double

/** Trim the fitted value so that it will be in valid range. */
def project(mu: Double): Double = mu
Expand Down Expand Up @@ -383,10 +382,13 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
weight * math.pow(y - mu, 2.0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(y - mu) * (y - mu), which is faster

}

override def aic(predictions: RDD[(Double, Double, Double)], deviance: Double, weightSum: Double): Double = {
val nobs = predictions.count()
override def aic(
predictions: RDD[(Double, Double, Double)],
deviance: Double,
numInstances: Double,
weightSum: Double): Double = {
val wt = predictions.map(x => math.log(x._3)).sum()
nobs * (math.log(deviance / nobs * 2.0 * 3.141593) + 1.0) + 2.0 - wt
numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt
}

override def project(mu: Double): Double = {
Expand Down Expand Up @@ -419,13 +421,18 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine

override def deviance(y: Double, mu: Double, weight: Double): Double = {
val my = 1.0 - y
2.0 * weight * (y * math.log(math.max(y, 1.0) / mu) + my * math.log(math.max(my, 1.0) / (1.0 - mu)))
2.0 * weight * (y * math.log(math.max(y, 1.0) / mu) +
my * math.log(math.max(my, 1.0) / (1.0 - mu)))
}

override def aic(predictions: RDD[(Double, Double, Double)], deviance: Double, weightSum: Double): Double = {
predictions.map { case (y: Double, mu: Double, weight: Double) =>
override def aic(
predictions: RDD[(Double, Double, Double)],
deviance: Double,
numInstances: Double,
weightSum: Double): Double = {
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
weight * breeze.stats.distributions.Binomial(1, mu).logProbabilityOf(math.round(y).toInt)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import distributions as dist and use dist.Binomial here?

}.sum() * (-2.0)
}.sum()
}

override def project(mu: Double): Double = {
Expand Down Expand Up @@ -459,10 +466,14 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
2 * weight * (y * math.log(y / mu) - (y - mu))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2.0 (just to be consistent)

}

override def aic(predictions: RDD[(Double, Double, Double)], deviance: Double, weightSum: Double): Double = {
predictions.map { case (y: Double, mu: Double, weight: Double) =>
override def aic(
predictions: RDD[(Double, Double, Double)],
deviance: Double,
numInstances: Double,
weightSum: Double): Double = {
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
weight * breeze.stats.distributions.Poisson(mu).logProbabilityOf(y.toInt)
}.sum() * (-2.0)
}.sum()
}

override def project(mu: Double): Double = {
Expand Down Expand Up @@ -497,11 +508,15 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
-2.0 * weight * (math.log(x) - (y - mu)/mu)
}

override def aic(predictions: RDD[(Double, Double, Double)], deviance: Double, weightSum: Double): Double = {
override def aic(
predictions: RDD[(Double, Double, Double)],
deviance: Double,
numInstances: Double,
weightSum: Double): Double = {
val disp = deviance / weightSum
predictions.map { case (y: Double, mu: Double, weight: Double) =>
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
weight * breeze.stats.distributions.Gamma(1.0 / disp, mu * disp).logPdf(y)
}.sum() * (-2.0) + 2.0
}.sum() + 2.0
}

override def project(mu: Double): Double = {
Expand Down Expand Up @@ -647,6 +662,11 @@ class GeneralizedLinearRegressionModel private[ml] (

private var trainingSummary: Option[GeneralizedLinearRegressionSummary] = None

/**
* Gets R-like summary of model on training set. An exception is
* thrown if `trainingSummary == None`.
*/
@Since("2.0.0")
def summary: GeneralizedLinearRegressionSummary = trainingSummary match {
case Some(summ) => summ
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trainingSummary.getOrElse {
  throw new Exception(...)
}

case None =>
Expand Down Expand Up @@ -789,32 +809,27 @@ class GeneralizedLinearRegressionSummary private[regression] (
private lazy val devianceResiduals: DataFrame = {
val drUDF = udf { (y: Double, mu: Double, weight: Double) =>
val r = math.sqrt(math.max(family.deviance(y, mu, weight), 0.0))
if (y > mu) {
r
} else {
-1.0 * r
}
if (y > mu) r else -1.0 * r
}
val w = if (model.getWeightCol.isEmpty) lit(1.0) else col(model.getWeightCol)
predictions.select(drUDF(col(model.getLabelCol), col(predictionCol), w).as("devianceResiduals"))
predictions.select(
drUDF(col(model.getLabelCol), col(predictionCol), w).as("devianceResiduals"))
}

private lazy val pearsonResiduals: DataFrame = {
val prUDF = udf { mu: Double => family.variance(mu) }
val w = if (model.getWeightCol.isEmpty) lit(1.0) else col(model.getWeightCol)
predictions.select(col(model.getLabelCol).minus(col(model.getPredictionCol))
.multiply(sqrt(w)).divide(sqrt(prUDF(col(model.getPredictionCol)))).as("pearsonResiduals"))
predictions.select(col(model.getLabelCol).minus(col(predictionCol))
.multiply(sqrt(w)).divide(sqrt(prUDF(col(predictionCol)))).as("pearsonResiduals"))
}

private lazy val workingResiduals: DataFrame = {
val wrUDF = udf { (y: Double, mu: Double) =>
(y - mu) * link.deriv(mu)
}
val wrUDF = udf { (y: Double, mu: Double) => (y - mu) * link.deriv(mu) }
predictions.select(wrUDF(col(model.getLabelCol), col(predictionCol)).as("workingResiduals"))
}

private lazy val responseResiduals: DataFrame = {
predictions.select(col(model.getLabelCol).minus(col(model.getPredictionCol)).as("responseResiduals"))
predictions.select(col(model.getLabelCol).minus(col(predictionCol)).as("responseResiduals"))
}

/**
Expand All @@ -829,7 +844,7 @@ class GeneralizedLinearRegressionSummary private[regression] (
case "working" => workingResiduals
case "response" => responseResiduals
case other => throw new UnsupportedOperationException(
s"The residuals type $other is not supported by Generialized Linear Regression.")
s"The residuals type $other is not supported by Generalized Linear Regression.")
}
}

Expand All @@ -838,15 +853,12 @@ class GeneralizedLinearRegressionSummary private[regression] (
*/
lazy val nullDeviance: Double = {
val w = if (model.getWeightCol.isEmpty) lit(1.0) else col(model.getWeightCol)

val wtdmu: Double = if (model.getFitIntercept) {
val t1 = predictions.select(sum(w.multiply(col(model.getLabelCol)))).first().getDouble(0)
val t2 = predictions.select(sum(w)).first().getDouble(0)
t1 / t2
val agg = predictions.agg(sum(w.multiply(col(model.getLabelCol))), sum(w)).first()
agg.getDouble(0) / agg.getDouble(1)
} else {
link.unlink(0.0)
}

predictions.select(col(model.getLabelCol), w).rdd.map {
case Row(y: Double, weight: Double) =>
family.deviance(y, wtdmu, weight)
Expand Down Expand Up @@ -874,63 +886,48 @@ class GeneralizedLinearRegressionSummary private[regression] (
model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

family and link are members of the summary class. We could either use them here or remove them from the constructor. I like the latter approach better since it is more consistent.

1.0
} else {
val w = if (model.getWeightCol.isEmpty) lit(1.0) else col(model.getWeightCol)
val rss = predictions.select(col(model.getPredictionCol), col(model.getLabelCol), w).rdd.map {
case Row(pred: Double, label: Double, weight: Double) =>
math.pow(label - pred, 2.0) / family.variance(pred) * weight
}.sum()

val rss = pearsonResiduals.agg(sum(pow(col("pearsonResiduals"), 2.0))).first().getDouble(0)
rss / degreesOfFreedom
}

/** Akaike's "An Information Criterion" for the fitted model */
/** Akaike's "An Information Criterion"(AIC) for the fitted model */
lazy val aic: Double = {
val w = if (model.getWeightCol.isEmpty) lit(1.0) else col(model.getWeightCol)
val weightSum = predictions.select(w).agg(sum(w)).first().getAs[Double](0)
val t = predictions.select(col(model.getPredictionCol), col(model.getLabelCol), w).rdd.map {
case Row(pred: Double, label: Double, weight: Double) =>
val t = predictions.select(col(model.getLabelCol), col(predictionCol), w).rdd.map {
case Row(label: Double, pred: Double, weight: Double) =>
(label, pred, weight)
}
family.aic(t, deviance, weightSum) + 2 * rank
family.aic(t, deviance, numInstances, weightSum) + 2 * rank
}

/**
* Standard error of estimated coefficients and intercept.
*/
lazy val coefficientStandardErrors: Array[Double] = {
diagInvAtWA.map(_ * dispersion).map(math.sqrt(_))
diagInvAtWA.map(_ * dispersion).map(math.sqrt)
}

/**
* T-statistic of estimated coefficients and intercept.
*/
lazy val tValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
throw new UnsupportedOperationException(
"No t-statistic available for this LinearRegressionModel")
val estimate = if (model.getFitIntercept) {
Array.concat(model.coefficients.toArray, Array(model.intercept))
} else {
val estimate = if (model.getFitIntercept) {
Array.concat(model.coefficients.toArray, Array(model.intercept))
} else {
model.coefficients.toArray
}
estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
model.coefficients.toArray
}
estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
}

/**
* Two-sided p-value of estimated coefficients and intercept.
*/
lazy val pValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
throw new UnsupportedOperationException(
"No p-value available for this LinearRegressionModel")
if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
tValues.map { x => 2.0 * (1.0 - GD(0.0, 1.0).cdf(math.abs(x))) }
} else {
if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
tValues.map { x => 2.0 * (1.0 - GD(0.0, 1.0).cdf(math.abs(x))) }
} else {
tValues.map { x => 2.0 * (1.0 - StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) }
}
tValues.map { x => 2.0 * (1.0 - StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) }
}
}
}