Commit 872d03d

[MRG] FIX Revert the addition of ndcg_score and dcg_score (scikit-learn#9932)
1 parent 063eb15 commit 872d03d

File tree

  doc/modules/classes.rst
  doc/modules/model_evaluation.rst
  sklearn/metrics/__init__.py
  sklearn/metrics/ranking.py
  sklearn/metrics/tests/test_ranking.py

5 files changed: 1 addition, 136 deletions

doc/modules/classes.rst
Lines changed: 0 additions & 2 deletions

@@ -783,15 +783,13 @@ details.
    metrics.classification_report
    metrics.cohen_kappa_score
    metrics.confusion_matrix
-   metrics.dcg_score
    metrics.f1_score
    metrics.fbeta_score
    metrics.hamming_loss
    metrics.hinge_loss
    metrics.jaccard_similarity_score
    metrics.log_loss
    metrics.matthews_corrcoef
-   metrics.ndcg_score
    metrics.precision_recall_curve
    metrics.precision_recall_fscore_support
    metrics.precision_score

doc/modules/model_evaluation.rst
Lines changed: 0 additions & 8 deletions

@@ -308,14 +308,6 @@ Some also work in the multilabel case:
    recall_score
    zero_one_loss
 
-Some are typically used for ranking:
-
-.. autosummary::
-   :template: function.rst
-
-   dcg_score
-   ndcg_score
-
 And some work with binary and multilabel (but not multiclass) problems:
 
 .. autosummary::

sklearn/metrics/__init__.py
Lines changed: 0 additions & 4 deletions

@@ -12,8 +12,6 @@
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
-from .ranking import dcg_score
-from .ranking import ndcg_score
 
 from .classification import accuracy_score
 from .classification import classification_report
@@ -118,6 +116,4 @@
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
-    'dcg_score',
-    'ndcg_score'
 ]
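
With these imports and __all__ entries removed, dcg_score and ndcg_score are no longer part of the public sklearn.metrics namespace. A minimal sanity check, assuming a scikit-learn build that includes this revert:

    # Minimal check that the two names are gone from the public namespace.
    # Assumes a scikit-learn build that includes this revert.
    import sklearn.metrics

    for name in ("dcg_score", "ndcg_score"):
        print(name, "available:", hasattr(sklearn.metrics, name))  # expected: False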

sklearn/metrics/ranking.py
Lines changed: 1 addition & 89 deletions

@@ -26,7 +26,7 @@
 
 from ..utils import assert_all_finite
 from ..utils import check_consistent_length
-from ..utils import column_or_1d, check_array, check_X_y
+from ..utils import column_or_1d, check_array
 from ..utils.multiclass import type_of_target
 from ..utils.extmath import stable_cumsum
 from ..utils.sparsefuncs import count_nonzero
@@ -788,91 +788,3 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):
     loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.
 
     return np.average(loss, weights=sample_weight)
-
-
-def dcg_score(y_true, y_score, k=5):
-    """Discounted cumulative gain (DCG) at rank K.
-
-    Parameters
-    ----------
-    y_true : array, shape = [n_samples]
-        Ground truth (true relevance labels).
-    y_score : array, shape = [n_samples]
-        Predicted scores.
-    k : int
-        Rank.
-
-    Returns
-    -------
-    score : float
-
-    References
-    ----------
-    .. [1] `Wikipedia entry for the Discounted Cumulative Gain
-           <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_
-    """
-    order = np.argsort(y_score)[::-1]
-    y_true = np.take(y_true, order[:k])
-
-    gain = 2 ** y_true - 1
-
-    discounts = np.log2(np.arange(len(y_true)) + 2)
-    return np.sum(gain / discounts)
-
-
-def ndcg_score(y_true, y_score, k=5):
-    """Normalized discounted cumulative gain (NDCG) at rank K.
-
-    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
-    recommendation system based on the graded relevance of the recommended
-    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
-    ranking of the entities.
-
-    Parameters
-    ----------
-    y_true : array, shape = [n_samples]
-        Ground truth (true labels represended as integers).
-    y_score : array, shape = [n_samples, n_classes]
-        Predicted probabilities.
-    k : int
-        Rank.
-
-    Returns
-    -------
-    score : float
-
-    Examples
-    --------
-    >>> y_true = [1, 0, 2]
-    >>> y_score = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
-    >>> ndcg_score(y_true, y_score, k=2)
-    1.0
-    >>> y_score = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
-    >>> ndcg_score(y_true, y_score, k=2)
-    0.66666666666666663
-
-    References
-    ----------
-    .. [1] `Kaggle entry for the Normalized Discounted Cumulative Gain
-           <https://www.kaggle.com/wiki/NormalizedDiscountedCumulativeGain>`_
-    """
-    y_score, y_true = check_X_y(y_score, y_true)
-
-    # Make sure we use all the labels (max between the length and the higher
-    # number in the array)
-    lb = LabelBinarizer()
-    lb.fit(np.arange(max(np.max(y_true) + 1, len(y_true))))
-    binarized_y_true = lb.transform(y_true)
-
-    if binarized_y_true.shape != y_score.shape:
-        raise ValueError("y_true and y_score have different value ranges")
-
-    scores = []
-
-    # Iterate over each y_value_true and compute the DCG score
-    for y_value_true, y_value_score in zip(binarized_y_true, y_score):
-        actual = dcg_score(y_value_true, y_value_score, k)
-        best = dcg_score(y_value_true, y_value_true, k)
-        scores.append(actual / best)
-
-    return np.mean(scores)
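
For reference, the computation performed by the removed functions can be reproduced with plain NumPy. The sketch below is illustrative only: it is not scikit-learn API, and it simplifies the label handling by assuming integer labels that index the columns of the score matrix directly (the removed code used LabelBinarizer instead). It reproduces the first example from the removed docstring.

    import numpy as np

    def dcg_at_k(relevance, scores, k=5):
        # Rank items by predicted score (descending) and keep the top k.
        top = np.take(relevance, np.argsort(scores)[::-1][:k])
        # Exponential gain with a logarithmic position discount.
        gains = 2 ** top - 1
        discounts = np.log2(np.arange(len(top)) + 2)
        return np.sum(gains / discounts)

    def ndcg_at_k(y_true, y_score, k=5):
        # One-hot encode the integer labels (one row per sample) and average
        # each sample's DCG@k normalized by its ideal (best possible) DCG@k.
        y_true = np.asarray(y_true)
        y_score = np.asarray(y_score)
        onehot = np.eye(y_score.shape[1])[y_true]
        return np.mean([dcg_at_k(t, s, k) / dcg_at_k(t, t, k)
                        for t, s in zip(onehot, y_score)])

    # Reproduces the first docstring example: every sample's true class is
    # ranked first, so NDCG@2 is 1.0.
    print(ndcg_at_k([1, 0, 2],
                    [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]],
                    k=2))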

sklearn/metrics/tests/test_ranking.py
Lines changed: 0 additions & 33 deletions

@@ -29,7 +29,6 @@
 from sklearn.metrics import label_ranking_loss
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import roc_curve
-from sklearn.metrics import ndcg_score
 
 from sklearn.exceptions import UndefinedMetricWarning
 
@@ -738,38 +737,6 @@ def check_zero_or_all_relevant_labels(lrap_score):
                                    [[0.5], [0.5], [0.5], [0.5]]), 1.)
 
 
-def test_ndcg_score():
-    # Check perfect ranking
-    y_true = [1, 0, 2]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    perfect = ndcg_score(y_true, y_score)
-    assert_equal(perfect, 1.0)
-
-    # Check bad ranking with a small K
-    y_true = [0, 2, 1]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    short_k = ndcg_score(y_true, y_score, k=1)
-    assert_equal(short_k, 0.0)
-
-    # Check a random scoring
-    y_true = [2, 1, 0]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    average_ranking = ndcg_score(y_true, y_score, k=2)
-    assert_almost_equal(average_ranking, 0.63092975)
-
-
 def check_lrap_error_raised(lrap_score):
     # Raise value error if not appropriate format
     assert_raises(ValueError, lrap_score,
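
As a cross-check of the removed expectation: in the "random scoring" case the true class of every sample lands at rank 2 of its top-two ranking, so each per-sample NDCG@2 is (2^1 - 1) / log2(2 + 1) = 1 / log2(3), which is where the 0.63092975 constant comes from. A quick confirmation with plain NumPy (not scikit-learn API):

    import numpy as np

    # The true class appears at position 2 in each sample's top-2 ranking,
    # so the per-sample NDCG@2 (and therefore the mean) is 1 / log2(3).
    print(1.0 / np.log2(3))  # ~0.63092975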

0 commit comments