Changes from 1 commit
33 commits
53713c9
Fix for issue #6352
tracer0tong Feb 17, 2016
65a2b8f
Fixed codestyle
tracer0tong Feb 18, 2016
eb242c2
ENH: FeatureHasher now accepts string values.
devashishd12 Jan 15, 2016
876f123
Do not ignore files starting with _ in nose
lesteve Feb 29, 2016
2e7d9ad
FIX: improve docs of randomized lasso
Mar 7, 2016
bf81451
Fix consistency in docs and docstring
Mar 7, 2016
a754e09
Added ref to Bach and improved docs
Mar 7, 2016
f81e5aa
Try to fix link to pdf
Mar 7, 2016
3a83071
fix x and y order
Mar 7, 2016
4eca0c9
updated info for cross_val_score
ohld Mar 14, 2016
c2eaf75
Merge pull request #6173 from dsquareindia/featurehasher_fix
MechCoder Mar 19, 2016
b64e992
Merge pull request #6542 from ohld/make_scorer-link
glouppe Mar 19, 2016
e2e6bde
Merge pull request #6498 from clamus/rand-lasso-fix-6493
glouppe Mar 19, 2016
9691824
Merge pull request #6466 from lesteve/nose-ignore-files-tweak
glouppe Mar 19, 2016
7580746
Fix broken link in ABOUT
bryandeng Mar 20, 2016
bd6b313
Merge pull request #6565 from bryandeng/doc-link
agramfort Mar 20, 2016
549474d
[gardening] Fix NameError ("estimator" not defined). Remove unused va…
practicalswift Mar 20, 2016
e228581
Merge pull request #6566 from practicalswift/fix-nameerror-and-remove…
jnothman Mar 21, 2016
e9492b7
LabelBinarizer single label case now works for sparse and dense case
devashishd12 Jan 24, 2016
528533d
MAINT: Simplify n_features_to_select in RFECV
MechCoder Mar 21, 2016
945cb7e
Merge pull request #6221 from dsquareindia/LabelBinarizer_fix
MechCoder Mar 21, 2016
54af09e
Fixing typos in logistic regression docs
hlin117 Mar 21, 2016
146f461
Merge pull request #6575 from hlin117/logregdocs
agramfort Mar 22, 2016
07a6433
Fix typo in html target
Mar 22, 2016
b3c2219
Add the possibility to add prior to Gaussian Naive Bayes
Jan 18, 2016
5a046c7
Update whatsnew
MechCoder Mar 22, 2016
5d92bd5
Merge pull request #6579 from nlathia/issue-6541
TomDLT Mar 23, 2016
65b570b
Update scorer.py
lizsz Mar 19, 2016
56d625f
Merge pull request #6569 from MechCoder/minor
TomDLT Mar 23, 2016
22d7cd5
Make dump_svmlight_file support sparse y
yenchenlin Feb 18, 2016
eed5fc5
Merge pull request #6395 from yenchenlin1994/make-dump_svmlight_file-…
TomDLT Mar 23, 2016
afc058f
Merge pull request #6376 from tracer0tong/issue_6352
TomDLT Mar 24, 2016
612cd9e
ENH: Support data centering in LogisticRegression
kernc Mar 17, 2016
Make dump_svmlight_file support sparse y
yenchenlin committed Mar 23, 2016
commit 22d7cd5cd07e301461d635aa4235c7d10135a4a9
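
For orientation, a minimal sketch of what this commit enables: passing a sparse target matrix straight to dump_svmlight_file for a multilabel dump, instead of densifying it first. The toy data and the printed output are illustrative assumptions inferred from the diff below, not part of the patch.

import numpy as np
import scipy.sparse as sp
from io import BytesIO
from sklearn.datasets import dump_svmlight_file

# Two samples, three features.
X = np.array([[1.0, 0.0, 3.0],
              [0.0, 5.0, 0.0]])

# Multilabel indicator targets as a CSR matrix of shape (n_samples, n_labels);
# with this branch, no .toarray() conversion should be needed before dumping.
y = sp.csr_matrix([[0, 1],
                   [1, 1]])

f = BytesIO()
dump_svmlight_file(X, y, f, multilabel=True)
print(f.getvalue().decode("ascii"))
# Expected, following the label/feature patterns used in _dump_svmlight:
# 1 0:1 2:3
# 0,1 1:5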
46 changes: 33 additions & 13 deletions sklearn/datasets/svmlight_format.py
@@ -276,7 +276,8 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
 
 
 def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id):
-    is_sp = int(hasattr(X, "tocsr"))
+    X_is_sp = int(hasattr(X, "tocsr"))
+    y_is_sp = int(hasattr(y, "tocsr"))
     if X.dtype.kind == 'i':
         value_pattern = u("%d:%d")
     else:
@@ -302,7 +303,7 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id):
         f.writelines(b("# %s\n" % line) for line in comment.splitlines())
 
     for i in range(X.shape[0]):
-        if is_sp:
+        if X_is_sp:
             span = slice(X.indptr[i], X.indptr[i + 1])
             row = zip(X.indices[span], X.data[span])
         else:
@@ -312,10 +313,16 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id):
         s = " ".join(value_pattern % (j + one_based, x) for j, x in row)
 
         if multilabel:
-            nz_labels = np.where(y[i] != 0)[0]
+            if y_is_sp:
+                nz_labels = y[i].nonzero()[1]
+            else:
+                nz_labels = np.where(y[i] != 0)[0]
             labels_str = ",".join(label_pattern % j for j in nz_labels)
         else:
-            labels_str = label_pattern % y[i]
+            if y_is_sp:
+                labels_str = label_pattern % y.data[i]
+            else:
+                labels_str = label_pattern % y[i]
 
         if query_id is not None:
             feat = (labels_str, query_id[i], s)
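
A note on the sparse branch above: indexing row i of a CSR matrix returns a 1 x n_labels matrix whose nonzero() gives a (rows, cols) pair, so [1] selects the nonzero label columns. A toy check with made-up labels, not taken from the patch:

import scipy.sparse as sp

y = sp.csr_matrix([[0, 1, 1],
                   [1, 0, 0]])
print(y[0].nonzero()[1])  # [1 2] -- the nonzero label columns of sample 0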
@@ -341,9 +348,10 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None,
         Training vectors, where n_samples is the number of samples and
         n_features is the number of features.
 
-    y : array-like, shape = [n_samples] or [n_samples, n_labels]
-        Target values. Class labels must be an integer or float, or array-like
-        objects of integer or float for multilabel classifications.
+    y : {array-like, sparse matrix}, shape = [n_samples (, n_labels)]
+        Target values. Class labels must be an
+        integer or float, or array-like objects of integer or float for
+        multilabel classifications.
 
     f : string or file-like in binary mode
         If string, specifies the path that will contain the data.
@@ -385,19 +393,31 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None,
         if six.b("\0") in comment:
             raise ValueError("comment string contains NUL byte")
 
-    y = np.asarray(y)
-    if y.ndim != 1 and not multilabel:
-        raise ValueError("expected y of shape (n_samples,), got %r"
-                         % (y.shape,))
+    yval = check_array(y, accept_sparse='csr', ensure_2d=False)
+    if sp.issparse(yval):
+        if yval.shape[1] != 1 and not multilabel:
+            raise ValueError("expected y of shape (n_samples, 1),"
+                             " got %r" % (yval.shape,))
+    else:
+        if yval.ndim != 1 and not multilabel:
+            raise ValueError("expected y of shape (n_samples,), got %r"
+                             % (yval.shape,))
 
     Xval = check_array(X, accept_sparse='csr')
-    if Xval.shape[0] != y.shape[0]:
+    if Xval.shape[0] != yval.shape[0]:
         raise ValueError("X.shape[0] and y.shape[0] should be the same, got"
-                         " %r and %r instead." % (Xval.shape[0], y.shape[0]))
+                         " %r and %r instead." % (Xval.shape[0], yval.shape[0]))
 
     # We had some issues with CSR matrices with unsorted indices (e.g. #1501),
     # so sort them here, but first make sure we don't modify the user's X.
     # TODO We can do this cheaper; sorted_indices copies the whole matrix.
+    if yval is y and hasattr(yval, "sorted_indices"):
+        y = yval.sorted_indices()
+    else:
+        y = yval
+        if hasattr(y, "sort_indices"):
+            y.sort_indices()
+
     if Xval is X and hasattr(Xval, "sorted_indices"):
         X = Xval.sorted_indices()
     else:
118 changes: 68 additions & 50 deletions sklearn/datasets/tests/test_svmlight_format.py
@@ -2,6 +2,7 @@
 import gzip
 from io import BytesIO
 import numpy as np
+import scipy.sparse as sp
 import os
 import shutil
 from tempfile import NamedTemporaryFile
@@ -200,67 +201,84 @@ def test_invalid_filename():
 
 
 def test_dump():
-    Xs, y = load_svmlight_file(datafile)
-    Xd = Xs.toarray()
+    X_sparse, y_dense = load_svmlight_file(datafile)
+    X_dense = X_sparse.toarray()
+    y_sparse = sp.csr_matrix(y_dense)
 
     # slicing a csr_matrix can unsort its .indices, so test that we sort
     # those correctly
-    Xsliced = Xs[np.arange(Xs.shape[0])]
-
-    for X in (Xs, Xd, Xsliced):
-        for zero_based in (True, False):
-            for dtype in [np.float32, np.float64, np.int32]:
-                f = BytesIO()
-                # we need to pass a comment to get the version info in;
-                # LibSVM doesn't grok comments so they're not put in by
-                # default anymore.
-                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
-                                   zero_based=zero_based)
-                f.seek(0)
-
-                comment = f.readline()
-                try:
-                    comment = str(comment, "utf-8")
-                except TypeError:  # fails in Python 2.x
-                    pass
-
-                assert_in("scikit-learn %s" % sklearn.__version__, comment)
-
-                comment = f.readline()
-                try:
-                    comment = str(comment, "utf-8")
-                except TypeError:  # fails in Python 2.x
-                    pass
-
-                assert_in(["one", "zero"][zero_based] + "-based", comment)
-
-                X2, y2 = load_svmlight_file(f, dtype=dtype,
-                                            zero_based=zero_based)
-                assert_equal(X2.dtype, dtype)
-                assert_array_equal(X2.sorted_indices().indices, X2.indices)
-                if dtype == np.float32:
-                    assert_array_almost_equal(
-                        # allow a rounding error at the last decimal place
-                        Xd.astype(dtype), X2.toarray(), 4)
-                else:
-                    assert_array_almost_equal(
-                        # allow a rounding error at the last decimal place
-                        Xd.astype(dtype), X2.toarray(), 15)
-                assert_array_equal(y, y2)
+    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
+    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]
+
+    for X in (X_sparse, X_dense, X_sliced):
+        for y in (y_sparse, y_dense, y_sliced):
+            for zero_based in (True, False):
+                for dtype in [np.float32, np.float64, np.int32]:
+                    f = BytesIO()
+                    # we need to pass a comment to get the version info in;
+                    # LibSVM doesn't grok comments so they're not put in by
+                    # default anymore.
+
+                    if (sp.issparse(y) and y.shape[0] == 1):
+                        # make sure y's shape is: (n_samples, n_labels)
+                        # when it is sparse
+                        y = y.T
+
+                    dump_svmlight_file(X.astype(dtype), y, f, comment="test",
+                                       zero_based=zero_based)
+                    f.seek(0)
+
+                    comment = f.readline()
+                    try:
+                        comment = str(comment, "utf-8")
+                    except TypeError:  # fails in Python 2.x
+                        pass
+
+                    assert_in("scikit-learn %s" % sklearn.__version__, comment)
+
+                    comment = f.readline()
+                    try:
+                        comment = str(comment, "utf-8")
+                    except TypeError:  # fails in Python 2.x
+                        pass
+
+                    assert_in(["one", "zero"][zero_based] + "-based", comment)
+
+                    X2, y2 = load_svmlight_file(f, dtype=dtype,
+                                                zero_based=zero_based)
+                    assert_equal(X2.dtype, dtype)
+                    assert_array_equal(X2.sorted_indices().indices, X2.indices)
+
+                    X2_dense = X2.toarray()
+
+                    if dtype == np.float32:
+                        # allow a rounding error at the last decimal place
+                        assert_array_almost_equal(
+                            X_dense.astype(dtype), X2_dense, 4)
+                        assert_array_almost_equal(
+                            y_dense.astype(dtype), y2, 4)
+                    else:
+                        # allow a rounding error at the last decimal place
+                        assert_array_almost_equal(
+                            X_dense.astype(dtype), X2_dense, 15)
+                        assert_array_almost_equal(
+                            y_dense.astype(dtype), y2, 15)
 
 
 def test_dump_multilabel():
     X = [[1, 0, 3, 0, 5],
          [0, 0, 0, 0, 0],
          [0, 5, 0, 1, 0]]
-    y = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
-    f = BytesIO()
-    dump_svmlight_file(X, y, f, multilabel=True)
-    f.seek(0)
-    # make sure it dumps multilabel correctly
-    assert_equal(f.readline(), b("1 0:1 2:3 4:5\n"))
-    assert_equal(f.readline(), b("0,2 \n"))
-    assert_equal(f.readline(), b("0,1 1:5 3:1\n"))
+    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
+    y_sparse = sp.csr_matrix(y_dense)
+    for y in [y_dense, y_sparse]:
+        f = BytesIO()
+        dump_svmlight_file(X, y, f, multilabel=True)
+        f.seek(0)
+        # make sure it dumps multilabel correctly
+        assert_equal(f.readline(), b("1 0:1 2:3 4:5\n"))
+        assert_equal(f.readline(), b("0,2 \n"))
+        assert_equal(f.readline(), b("0,1 1:5 3:1\n"))
 
 
 def test_dump_concise():
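
One more note on the y.shape[0] == 1 transpose in test_dump above: wrapping a 1-D label vector in sp.csr_matrix yields a row vector of shape (1, n_samples), while the dump expects one row per sample. A quick sketch of that pitfall, on toy values:

import numpy as np
import scipy.sparse as sp

y = np.array([1, 0, 2])
y_sp = sp.csr_matrix(y)  # shape (1, 3): a single row, not one row per sample
y_sp = y_sp.T            # shape (3, 1): matches the expected (n_samples, 1)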