Commit 9ad265e

Merge branch 'releases' into dfsg
* releases: (57 commits)
  REL increased version number, added whatsnew
  DOC some copyediting
  COSMIT pep8
  ENH set the random state to avoid heisenfailures
  ENH rewrite radius-NN classifier's outlier handling
  BUG in RadiusNeighborClassifier outlier handling
  BUG allow outlier_label=0 in RadiusNeighborClassifier
  TST stronger tests for arbitrary classes. make explicit what works and what doesn't.
  BF: explicitly mark train_test_split as not the one for nosetesting
  COSMIT update mailmap
  ENH fix test_estimators_overwrite_params to test regressors and transformers
  ENH speed up RBFSampler by ~10%
  TST fit_transform(X)==fit(X).transform(X)
  BUG in KernelPCA: wrong default value for gamma
  raise ValueError if division through zero in LogOddsEstimator
  fix: deviance computation in BinomialDeviance was wrong (ignored cases where y == 0) - thanks to ChrisBeaumont for reporting this issue
  fix: map labels to {0, 1}
  BUG fix broken grid search example
  DOC: add example and ref to lars_path in lasso_path
  BUG: highly-degenerate roc curves
  ...

Conflicts:
  sklearn/externals/joblib/__init__.py
  sklearn/externals/joblib/hashing.py
  sklearn/externals/joblib/test/test_hashing.py
2 parents b4f780b + 1bee1b8 commit 9ad265e

68 files changed: +1378, -733 lines

.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
 /sklearn/utils/arraybuilder.c -diff
 /sklearn/utils/arrayfuncs.c -diff
 /sklearn/utils/graph_shortest_path.c -diff
+/sklearn/utils/lgamma.c -diff
 /sklearn/utils/murmurhash.c -diff
 /sklearn/utils/seq_dataset.c -diff
 /sklearn/utils/sparsefuncs.c -diff

.mailmap

Lines changed: 12 additions & 0 deletions
@@ -68,3 +68,15 @@ Tim Sheerman-Chase <[email protected]> Tim Sheerman-Chase <ts00051@t
 Vincent Schut <[email protected]> Vincent Schut <vincent@TIMO.(none)>
 
 
+
+Hrishikesh Huilgolkar <[email protected]> <hrishikesh@QE-IND-WKS007.(none)>
+
+Brian Cheung <[email protected]> cow <briancheung>
+Brian Cheung <[email protected]> cow <cow@rusty.(none)>
+Diego Molla <[email protected]> <diego@diego-desktop.(none)>
+Michael EICKENBERG <[email protected]> Michael EICKENBERG <[email protected]>
+Michael EICKENBERG <[email protected]> Michael <[email protected]>
+
+
+
+X006 <x006@x006-icsl.(none)> x006 <x006@x006laptop.(none)>

benchmarks/bench_covertype.py

Lines changed: 23 additions & 46 deletions
@@ -44,25 +44,29 @@
 
 print __doc__
 
-# Author: Peter Prettenhoer <[email protected]>
+# Author: Peter Prettenhofer <[email protected]>
 # License: BSD Style.
 
-# $Id$
-
-from time import time
+import logging
 import os
 import sys
-import numpy as np
+from time import time
 from optparse import OptionParser
 
+import numpy as np
+
+from sklearn.datasets import fetch_covtype
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import SGDClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn import metrics
 from sklearn.externals.joblib import Memory
-from sklearn.utils import check_random_state
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+logger = logging.getLogger(__name__)
 
 op = OptionParser()
 op.add_option("--classifiers",
@@ -80,8 +84,7 @@
 # estimators.
 op.add_option("--random-seed",
               dest="random_seed", default=13, type=int,
-              help="Common seed used by random number generator."
-              )
+              help="Common seed used by random number generator.")
 
 op.print_help()
 
@@ -97,57 +100,31 @@
 joblib_cache_folder = os.path.join(bench_folder, 'bench_covertype_data')
 m = Memory(joblib_cache_folder, mmap_mode='r')
 
-# Set seed for rng
-rng = check_random_state(opts.random_seed)
-
 
 # Load the data, then cache and memmap the train/test split
 @m.cache
 def load_data(dtype=np.float32, order='F'):
-    ######################################################################
-    ## Download the data, if not already on disk
-    if not os.path.exists(original_archive):
-        # Download the data
-        import urllib
-        print "Downloading data, Please Wait (11MB)..."
-        opener = urllib.urlopen(
-            'http://archive.ics.uci.edu/ml/'
-            'machine-learning-databases/covtype/covtype.data.gz')
-        open(original_archive, 'wb').write(opener.read())
-
     ######################################################################
     ## Load dataset
     print("Loading dataset...")
-    import gzip
-    f = gzip.open(original_archive)
-    X = np.fromstring(f.read().replace(",", " "), dtype=dtype, sep=" ",
-                      count=-1)
-    X = X.reshape((581012, 55))
+    data = fetch_covtype(download_if_missing=True, shuffle=True,
+                         random_state=opts.random_seed)
+    X, y = data.data, data.target
     if order.lower() == 'f':
         X = np.asfortranarray(X)
-    f.close()
 
     # class 1 vs. all others.
-    y = np.ones(X.shape[0]) * -1
-    y[np.where(X[:, -1] == 1)] = 1
-    X = X[:, :-1]
+    y[np.where(y != 1)] = -1
 
     ######################################################################
     ## Create train-test split (as [Joachims, 2006])
-    print("Creating train-test split...")
-    idx = np.arange(X.shape[0])
-    rng.shuffle(idx)
-    train_idx = idx[:522911]
-    test_idx = idx[522911:]
+    logger.info("Creating train-test split...")
+    n_train = 522911
 
-    X_train = X[train_idx]
-    y_train = y[train_idx]
-    X_test = X[test_idx]
-    y_test = y[test_idx]
-
-    # free memory
-    del X
-    del y
+    X_train = X[:n_train]
+    y_train = y[:n_train]
+    X_test = X[n_train:]
+    y_test = y[n_train:]
 
     ######################################################################
     ## Standardize first 10 features (the numerical ones)
@@ -204,7 +181,7 @@ def benchmark(clf):
     'dual': False,
     'tol': 1e-3,
     "random_state": opts.random_seed,
-    }
+}
 classifiers['liblinear'] = LinearSVC(**liblinear_parameters)
 
 ######################################################################
@@ -218,7 +195,7 @@ def benchmark(clf):
     'n_iter': 2,
     'n_jobs': opts.n_jobs,
     "random_state": opts.random_seed,
-    }
+}
 classifiers['SGD'] = SGDClassifier(**sgd_parameters)
 
 ######################################################################
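
For reference, a minimal sketch of the loading pattern the rewritten benchmark now relies on; ``random_seed`` stands in for the script's ``--random-seed`` option and the float32 cast mirrors the ``dtype=np.float32`` default:

    # Sketch of the benchmark's data loading, outside the script.
    import numpy as np
    from sklearn.datasets import fetch_covtype

    random_seed = 13  # stand-in for --random-seed
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_seed)
    X, y = data.data.astype(np.float32), data.target

    # Class 1 vs. all others, as in the benchmark.
    y[y != 1] = -1

    # Fixed-size head/tail split ([Joachims, 2006] sizes); the data is
    # already shuffled inside fetch_covtype.
    n_train = 522911
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

Moving the shuffle into ``fetch_covtype(random_state=...)`` removes the module-level ``rng`` and keeps the train/test split reproducible for a given seed.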

doc/conf.py

Lines changed: 2 additions & 2 deletions
@@ -67,7 +67,7 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.13'
+version = '0.13.1'
 # The full version, including alpha/beta/rc tags.
 import sklearn
 release = sklearn.__version__
@@ -121,7 +121,7 @@
 # further. For a list of options available for each theme, see the
 # documentation.
 html_theme_options = {'oldversion': False, 'collapsiblesidebar': True,
-                      'google_analytics': True, 'surveybanner': True}
+                      'google_analytics': True}
 
 # Add any paths that contain custom themes here, relative to this directory.
 html_theme_path = ['themes']

doc/datasets/covtype.rst

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+
+.. _covtype:
+
+Forest covertypes
+=================
+
+The samples in this dataset correspond to 30×30m patches of forest in the US,
+collected for the task of predicting each patch's cover type,
+i.e. the dominant species of tree.
+There are seven covertypes, making this a multiclass classification problem.
+Each sample has 54 features, described on the
+`dataset's homepage <http://archive.ics.uci.edu/ml/datasets/Covertype>`_.
+Some of the features are boolean indicators,
+while others are discrete or continuous measurements.
+
+``sklearn.datasets.fetch_covtype`` will load the covertype dataset;
+it returns a ``Bunch`` object with the feature matrix in the ``data`` member
+and the target values in ``target``.
+The dataset will be downloaded from the web if necessary.
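
A hedged usage sketch of the loader this new page documents (the expected shape follows from the 54-feature description above):

    # Access pattern for the Bunch returned by fetch_covtype.
    import numpy as np
    from sklearn.datasets import fetch_covtype

    covtype = fetch_covtype(download_if_missing=True)
    X = covtype.data        # feature matrix, one row per 30x30m patch
    y = covtype.target      # one of the seven cover types per sample
    print(X.shape)          # expected (581012, 54)
    print(np.unique(y))     # the seven class labels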

doc/datasets/index.rst

Lines changed: 1 addition & 0 deletions
@@ -180,3 +180,4 @@ features::
 
 .. include:: labeled_faces.rst
 
+.. include:: covtype.rst

doc/modules/classes.rst

Lines changed: 1 addition & 0 deletions
@@ -938,6 +938,7 @@ Pairwise metrics
    :template: function.rst
 
    preprocessing.add_dummy_feature
+   preprocessing.balance_weights
    preprocessing.binarize
    preprocessing.normalize
    preprocessing.scale

doc/modules/feature_extraction.rst

Lines changed: 18 additions & 8 deletions
@@ -124,9 +124,9 @@ and has no ``inverse_transform`` method.
 
 Since the hash function might cause collisions between (unrelated) features,
 a signed hash function is used and the sign of the hash value
-determines the sign of the value stored in the output matrix for a feature;
-this way, collisions are likely to cancel out rather than accumulate error,
-and the expected mean of any output feature's value is zero
+determines the sign of the value stored in the output matrix for a feature.
+This way, collisions are likely to cancel out rather than accumulate error,
+and the expected mean of any output feature's value is zero.
 
 If ``non_negative=True`` is passed to the constructor,
 the absolute value is taken.
@@ -139,14 +139,20 @@ or ``chi2`` feature selectors that expect non-negative inputs.
 ``(feature, value)`` pairs, or strings,
 depending on the constructor parameter ``input_type``.
 Mapping are treated as lists of ``(feature, value)`` pairs,
-while single strings have an implicit value of 1.
-If a feature occurs multiple times in a sample, the values will be summed.
+while single strings have an implicit value of 1,
+so ``['feat1', 'feat2', 'feat3']`` is interpreted as
+``[('feat1', 1), ('feat2', 1), ('feat3', 1)]``.
+If a single feature occurs multiple times in a sample,
+the associated values will be summed
+(so ``('feat', 2)`` and ``('feat', 3.5)`` become ``('feat', 5.5)``).
+The output from :class:`FeatureHasher` is always a ``scipy.sparse`` matrix
+in the CSR format.
+
 Feature hashing can be employed in document classification,
 but unlike :class:`text.CountVectorizer`,
 :class:`FeatureHasher` does not do word
-splitting or any other preprocessing except Unicode-to-UTF-8 encoding.
-The output from :class:`FeatureHasher` is always a ``scipy.sparse`` matrix
-in the CSR format.
+splitting or any other preprocessing except Unicode-to-UTF-8 encoding;
+see :ref:`hashing_vectorizer`, below, for a combined tokenizer/hasher.
 
 As an example, consider a word-level natural language processing task
 that needs features extracted from ``(token, part_of_speech)`` pairs.
@@ -193,6 +199,10 @@ to determine the column index and sign of a feature, respectively.
 The present implementation works under the assumption
 that the sign bit of MurmurHash3 is independent of its other bits.
 
+Since a simple modulo is used to transform the hash function to a column index,
+it is advisable to use a power of two as the ``n_features`` parameter;
+otherwise the features will not be mapped evenly to the columns.
+
 
 .. topic:: References:
 
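To make the new ``(feature, value)`` wording concrete, a small illustrative sketch (the feature names are invented; ``input_type='pair'`` is used so repeated features within one sample can be shown):

    # Repeated features in a sample are summed; output is scipy.sparse CSR;
    # n_features is a power of two, as advised above.
    from sklearn.feature_extraction import FeatureHasher

    h = FeatureHasher(n_features=2 ** 10, input_type='pair')
    X = h.transform([[('feat', 2), ('feat', 3.5)],
                     [('feat1', 1), ('feat2', 1), ('feat3', 1)]])
    print(X.shape)           # (2, 1024)
    print(abs(X[0]).sum())   # 5.5 -- the two 'feat' values were summed
                             # (abs() because the signed hash may flip the sign)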
doc/modules/linear_model.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,7 @@ Passive Aggressive Algorithms
707707
=============================
708708

709709
The passive-aggressive algorithms are a family of algorithms for large-scale
710-
learning. They are similar to the Pereptron in that they do not require a
710+
learning. They are similar to the Perceptron in that they do not require a
711711
learning rate. However, contrary to the Perceptron, they include a
712712
regularization parameter ``C``.
713713

doc/modules/metrics.rst

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 .. _metrics:
 
-Metrics, Affinities and Kernels
-===============================
+Pairwise metrics, Affinities and Kernels
+========================================
 
 The :mod:`sklearn.metrics.pairwise` submodule implements utilities to evaluate
 pairwise distances or affinity of sets of samples.
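
As a minimal sketch of what the renamed section covers (toy data; the RBF ``gamma`` is an arbitrary choice):

    # Pairwise distance and kernel matrices from sklearn.metrics.pairwise.
    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances, pairwise_kernels

    X = np.array([[0., 1.], [1., 0.], [2., 2.]])
    D = pairwise_distances(X, metric='euclidean')     # (3, 3) distance matrix
    K = pairwise_kernels(X, metric='rbf', gamma=0.5)  # (3, 3) affinity matrix
    print(D.shape, K.shape)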
