
Commit 5e931b5

Add KNN Classifier (#397)
* add knn classifier, tests and doc
1 parent 2896c28 commit 5e931b5

File tree: 10 files changed (+294, -40 lines)

dislib/classification/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from dislib.classification.csvm.base import CascadeSVM
 from dislib.trees.forest import RandomForestClassifier
+from dislib.classification.knn.base import KNeighborsClassifier
 
-__all__ = ["CascadeSVM", "RandomForestClassifier"]
+__all__ = ["CascadeSVM", "RandomForestClassifier", "KNeighborsClassifier"]

dislib/classification/knn/__init__.py

Whitespace-only changes.

dislib/classification/knn/base.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
+import numpy as np
+from pycompss.api.api import compss_wait_on
+from pycompss.api.constraint import constraint
+from pycompss.api.parameter import COLLECTION_IN, COLLECTION_OUT, Depth, Type
+from pycompss.api.task import task
+from sklearn.base import BaseEstimator
+from dislib.data.array import Array
+from dislib.neighbors import NearestNeighbors
+from sklearn.metrics import accuracy_score
+
+from collections import defaultdict
+
+
+class KNeighborsClassifier(BaseEstimator):
+    """Classifier implementing the k-nearest neighbors vote.
+
+    Parameters
+    ----------
+    n_neighbors : int, default=5
+        Number of neighbors to use by default for :meth:`kneighbors` queries.
+    weights : {'uniform', 'distance'} or callable, default='uniform'
+        Weight function used in prediction. Possible values:
+
+        - 'uniform' : uniform weights. All points in each neighborhood
+          are weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          In this case, closer neighbors of a query point have a greater
+          influence than neighbors which are further away.
+        - [callable] : a user-defined function which accepts an array of
+          distances and returns an array of the same shape containing
+          the weights.
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used when shuffling
+        the data for probability estimates. If int, random_state is the
+        seed used by the random number generator; if RandomState instance,
+        random_state is the random number generator; if None, the random
+        number generator is the RandomState instance used by np.random.
+
+    Notes
+    -----
+    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
+    for a discussion of the choice of ``algorithm`` and ``leaf_size``.
+
+    .. warning::
+       If two neighbors, neighbor `k+1` and `k`, have identical distances
+       but different labels, the result depends on the ordering of the
+       training data.
+
+    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
+
+    Examples
+    --------
+    >>> import dislib as ds
+    >>> from dislib.classification import KNeighborsClassifier
+    >>> import numpy as np
+    >>>
+    >>> if __name__ == '__main__':
+    >>>     x = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+    >>>     y = np.array([1, 1, 2, 2])
+    >>>     train_data = ds.array(x, block_size=(4, 2))
+    >>>     train_labels = ds.array(y, block_size=(1, 2))
+    >>>     knn = KNeighborsClassifier(n_neighbors=3)
+    >>>     knn.fit(train_data, train_labels)
+    >>>     test_data = ds.array(np.array([[-0.8, -1]]), block_size=(1, 2))
+    >>>     y_pred = knn.predict(test_data)
+    >>>     print(y_pred)
+    """
+
+    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform',
+                 random_state=None):
+        self.n_neighbors = n_neighbors
+        self.weights = weights
+        self.random_state = random_state
+        self.nn = NearestNeighbors(n_neighbors)
+
+    def fit(self, x: Array, y: Array):
+        """ Fit the model using training data.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            Training data.
+        y : ds-array, shape=(n_samples, 1)
+            Class labels of x.
+
+        Returns
+        -------
+        self : KNeighborsClassifier
+        """
+        self.y = y
+        self.nn.fit(x)
+
+        return self
+
+    def predict(self, q: Array):
+        """ Perform classification on samples.
+
+        Parameters
+        ----------
+        q : ds-array, shape=(n_samples, n_features)
+            Input samples.
+
+        Returns
+        -------
+        y : ds-array, shape=(n_samples, 1)
+            Predicted class labels of q.
+        """
+        dist, ind = self.nn.kneighbors(q)
+
+        out_blocks = Array._get_out_blocks(self.y._n_blocks)
+
+        _indices_to_classes(ind._blocks, self.y._blocks, dist._blocks,
+                            out_blocks, self.weights)
+
+        return Array(blocks=out_blocks, top_left_shape=self.y._top_left_shape,
+                     reg_shape=self.y._reg_shape,
+                     shape=self.y.shape, sparse=False)
+
+    def score(self, q: Array, y: Array, collect=False):
+        """ Returns the mean accuracy on the given test data and labels.
+
+        Parameters
+        ----------
+        q : ds-array, shape=(n_samples, n_features)
+            Test samples.
+        y : ds-array, shape=(n_samples, 1)
+            True labels for q.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+        Returns
+        -------
+        score : float (as future object)
+            Mean accuracy of self.predict(q) wrt. y.
+        """
+        y_pred = self.predict(q)
+        score = _get_score(y._blocks, y_pred._blocks)
+
+        return compss_wait_on(score) if collect else score
+
+
+@constraint(computing_units="${ComputingUnits}")
+@task(ind_blocks={Type: COLLECTION_IN, Depth: 2},
+      y_blocks={Type: COLLECTION_IN, Depth: 2},
+      dist_blocks={Type: COLLECTION_IN, Depth: 2},
+      out_blocks={Type: COLLECTION_OUT, Depth: 2})
+def _indices_to_classes(ind_blocks, y_blocks, dist_blocks,
+                        out_blocks, weights):
+    ind = Array._merge_blocks(ind_blocks)
+    y = Array._merge_blocks(y_blocks).flatten()
+    dist = Array._merge_blocks(dist_blocks)
+
+    # Map each neighbor index to the class of that training sample.
+    classes = y[ind]
+
+    final_class = []
+    for crow, drow in zip(classes, dist):
+        d = defaultdict(int)
+        crow = crow.flatten()
+        for j in range(ind.shape[1]):
+            if weights == 'uniform':
+                w = 1
+            else:
+                w = (drow[j] + np.finfo(drow.dtype).eps)
+
+            d[crow[j]] += 1 / w
+
+        # Keep the class with the largest accumulated weight.
+        final_class.append(max(d, key=d.get))
+
+    blocks = np.array_split(final_class, len(y_blocks))
+
+    for i in range(len(y_blocks)):
+        out_blocks[i][0] = np.expand_dims(blocks[i][:], axis=1)
+
+
+@constraint(computing_units="${ComputingUnits}")
+@task(y_blocks={Type: COLLECTION_IN, Depth: 2},
+      ypred_blocks={Type: COLLECTION_IN, Depth: 2},
+      returns=float)
+def _get_score(y_blocks, ypred_blocks):
+    y = Array._merge_blocks(y_blocks).flatten()
+    y_pred = Array._merge_blocks(ypred_blocks).flatten()
+
+    return accuracy_score(y, y_pred)
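
For readers unfamiliar with the voting scheme, the COMPSs task `_indices_to_classes` above reduces neighbor indices and distances to one class label per query. The following standalone NumPy sketch reproduces that logic locally; the toy `y_train`, `ind` and `dist` values are made up for illustration (the real inputs come from `NearestNeighbors.kneighbors`):

import numpy as np
from collections import defaultdict

# Toy data: labels of 4 training points, plus the indices and distances
# of the 3 nearest neighbors of 2 query points.
y_train = np.array([1, 1, 2, 2])
ind = np.array([[0, 1, 2], [2, 3, 0]])
dist = np.array([[0.1, 0.5, 2.0], [0.2, 0.4, 1.9]])


def vote(ind, dist, y_train, weights='uniform'):
    classes = y_train[ind]  # neighbor indices -> neighbor class labels
    y_pred = []
    for crow, drow in zip(classes, dist):
        acc = defaultdict(float)
        for c, d in zip(crow, drow):
            # 'uniform' counts each neighbor once; anything else weights by
            # inverse distance, with eps guarding against division by zero
            # (the same guard base.py uses).
            if weights == 'uniform':
                w = 1.0
            else:
                w = 1.0 / (d + np.finfo(dist.dtype).eps)
            acc[c] += w
        y_pred.append(max(acc, key=acc.get))
    return np.array(y_pred)


print(vote(ind, dist, y_train))              # [1 2]
print(vote(ind, dist, y_train, 'distance'))  # [1 2]

Note that, as in the task above, any `weights` value other than 'uniform' falls through to inverse-distance weighting.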

dislib/decomposition/pca/base.py

Lines changed: 8 additions & 3 deletions
@@ -153,8 +153,10 @@ def _fit_eig(self, x):
 
         self.components_ = Array(vec_blocks, bshape, bshape,
                                  (shape1, x.shape[1]), False)
-        self.explained_variance_ = Array(val_blocks, bshape, bshape,
-                                         (1, shape1), False)
+
+        ex_var_bshape = (1, bshape)
+        self.explained_variance_ = Array(val_blocks, ex_var_bshape,
+                                         ex_var_bshape, (1, shape1), False)
 
         return self

@@ -247,8 +249,11 @@ def _decompose(covariance_matrix, n_components, bsize, val_blocks, vec_blocks):
     signs = np.sign(eig_vec[range(len(eig_vec)), max_abs_cols])
     eig_vec *= signs[:, np.newaxis]
 
+    if len(eig_val.shape) == 1:
+        eig_val = np.expand_dims(eig_val, axis=0)
+
     for i in range(len(vec_blocks)):
-        val_blocks[0][i] = eig_val[i * bsize:(i + 1) * bsize]
+        val_blocks[0][i] = eig_val[:, i * bsize:(i + 1) * bsize]
 
         for j in range(len(vec_blocks[i])):
             vec_blocks[i][j] = \
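
The PCA change above guards against 1-D eigenvalue arrays of the kind `np.linalg.eigh` returns: promoting them to a (1, n) row vector makes the block loop slice out 2-D blocks, as ds-arrays expect. A minimal sketch of the fixed slicing, with illustrative values for the array and the block width `bsize`:

import numpy as np

eig_val = np.array([4.0, 3.0, 2.0, 1.0])  # 1-D, as np.linalg.eigh returns it
bsize = 2  # block width

# Before the fix, eig_val[i * bsize:(i + 1) * bsize] produced 1-D chunks,
# while ds-array blocks must be 2-D. Promoting to a row vector first makes
# the column slices come out as (1, bsize) blocks.
if len(eig_val.shape) == 1:
    eig_val = np.expand_dims(eig_val, axis=0)  # shape (1, 4)

blocks = [eig_val[:, i * bsize:(i + 1) * bsize] for i in range(2)]
print([b.shape for b in blocks])  # [(1, 2), (1, 2)]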

docker/Dockerfile

Lines changed: 14 additions & 10 deletions
@@ -1,10 +1,13 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
 MAINTAINER COMPSs Support <[email protected]>
 
 # =============================================================================
 # Configuration required to use the image for jenkins testing
 # =============================================================================
 
+ENV DEBIAN_FRONTEND noninteractive
+ENV COMPSS_LOAD_SOURCE false
+
 RUN apt-get update && \
     # Install Packages
     apt-get install -y --no-install-recommends \

@@ -40,13 +43,15 @@ RUN apt-get update && \
     # Extrae dependencies
     libxml2 gfortran libpapi-dev papi-tools \
     # Misc. dependencies
-    openmpi-bin openmpi-doc libopenmpi-dev uuid-runtime curl bc cmake \
+    openmpi-bin openmpi-doc libopenmpi-dev uuid-runtime curl bc cmake && \
     # Python-binding and dislib dependencies
-    python3-dev python3-pip python3-setuptools && \
-    pip3 install wheel dill decorator coverage numpy==1.15.4 ipython==7.9.0 \
-        scipy==1.3.0 jupyter==1.0.0 scikit-learn==0.19.1 pandas==0.23.1 \
-        matplotlib==2.2.3 flake8 codecov parameterized && \
-    pip3 install cvxpy==1.1.5 && \
+    apt-get install -y python3 python3-pip python3-dev graphviz-dev && \
+    python3 -m pip install coverage ipython flake8 codecov parameterized cvxpy==1.1.5 && \
+    python3 -m pip install pycodestyle pydocstyle mpi4py numpy dill guppy3 memory_profiler matplotlib decorator \
+        jupyter pytest nbval pytest-cov pytest-notebook ipyparallel jupyter_nbextensions_configurator jupyterlab \
+        pytest nbval pytest-cov pytest-html-profiling pytest-metadata pytest-profiling pytest-subprocess pytest-sugar spacy && \
+    python3 -m ipykernel install && \
+    python3 -m spacy download en_core_web_sm && \
     # Configure user environment
 # =============================================================================
 # System configuration

@@ -57,12 +62,11 @@ RUN apt-get update && \
     echo "LD_LIBRARY_PATH=/usr/lib/openmpi/lib" >> /etc/environment && \
    mkdir /run/sshd && \
     # Clone framework files for installation
-    git clone --depth=1 --branch 2.8 https://github.com/bsc-wdc/compss.git framework && \
+    git clone --depth=1 --branch 3.0 https://github.com/bsc-wdc/compss.git framework && \
     # Install COMPSs
     cd /framework && \
     ./submodules_get.sh && \
-    ./submodules_patch.sh && \
-    sudo -E /framework/builders/buildlocal -NpAKT /opt/COMPSs && \
+    sudo -E /framework/builders/buildlocal -NpKT /opt/COMPSs && \
     rm -rf /framework /root/.m2 /root/.cache /home/jenkins/.COMPSs /tmp/* && \
     rm -rf /var/lib/apt/lists/*

docs/source/api-reference.rst

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,9 @@ dislib.classification: Classification
 :class:`classification.CascadeSVM <dislib.classification.csvm.base.CascadeSVM>`
 - Distributed support vector classification using a cascade of classifiers.
 
+:class:`classification.KNeighborsClassifier <dislib.classification.knn.base.KNeighborsClassifier>`
+- Distributed K neighbors classification using partial classifiers.
+
 
 dislib.cluster: Clustering
 --------------------------
docs/source/… (new file)

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+dislib.classification.KNeighborsClassifier
+===========================================
+
+.. automodule:: dislib.classification.knn.base
+   :members:
+   :undoc-members:
+   :show-inheritance:

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ numpydoc>=0.8.0
 cvxpy>=1.1.5
 cbor2>=5.4.0
 pandas>=0.24.2
-matplotlib>=2.2.3
\ No newline at end of file
+matplotlib>=2.2.3

tests/test_knn.py

Lines changed: 26 additions & 25 deletions
@@ -2,46 +2,47 @@
 
 import numpy as np
 from scipy.sparse import csr_matrix
-from sklearn.neighbors import NearestNeighbors as SKNearestNeighbors
+from sklearn.neighbors import KNeighborsClassifier as skKNeighborsClassifier
+from sklearn.datasets import make_classification
 
 import dislib as ds
-from dislib.neighbors import NearestNeighbors
-from tests import BaseTimedTestCase
+from dislib.classification import KNeighborsClassifier
 
 
-class NearestNeighborsTest(BaseTimedTestCase):
+class KNearestNeighborsTest(unittest.TestCase):
+
     def test_kneighbors(self):
         """ Tests kneighbors against scikit-learn """
-        x = np.random.random((1500, 5))
-        data = ds.array(x, block_size=(500, 3))
-        q_data = ds.array(x, block_size=(101, 2))
 
-        knn = NearestNeighbors(n_neighbors=10)
-        knn.fit(data)
-        dist, ind = knn.kneighbors(q_data)
+        X, Y = make_classification(n_samples=200, n_features=5)
+        x, y = ds.array(X, (50, 5)), ds.array(Y, (50, 1))
+
+        knn = KNeighborsClassifier(n_neighbors=3)
+        knn.fit(x, y)
+        ds_y_hat = knn.predict(x)
+        knn.score(x, y)
 
-        sknn = SKNearestNeighbors(n_neighbors=10)
-        sknn.fit(X=x)
-        skdist, skind = sknn.kneighbors(X=x)
+        sknn = skKNeighborsClassifier(n_neighbors=3)
+        sknn.fit(X, Y)
+        sk_y_hat = sknn.predict(X)
 
-        self.assertTrue(np.allclose(dist.collect(), skdist, atol=1e-7))
-        self.assertTrue(np.array_equal(ind.collect(), skind))
+        self.assertTrue(np.all(ds_y_hat.collect() == sk_y_hat))
 
     def test_kneighbors_sparse(self):
         """ Tests kneighbors against scikit-learn with sparse data """
-        x = csr_matrix(np.random.random((1500, 5)))
-        data = ds.array(x, block_size=(500, 5))
+        X, Y = make_classification(n_samples=200, n_features=5)
+        X = csr_matrix(X)
+        x, y = ds.array(X, (50, 5)), ds.array(Y, (50, 1))
 
-        knn = NearestNeighbors(n_neighbors=10)
-        knn.fit(data)
-        dist, ind = knn.kneighbors(data)
+        knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
+        knn.fit(x, y)
+        ds_y_hat = knn.predict(x)
 
-        sknn = SKNearestNeighbors(n_neighbors=10)
-        sknn.fit(X=x)
-        skdist, skind = sknn.kneighbors(X=x)
+        sknn = skKNeighborsClassifier(n_neighbors=3, weights='distance')
+        sknn.fit(X, Y)
+        sk_y_hat = sknn.predict(X)
 
-        self.assertTrue(np.allclose(dist.collect(), skdist, atol=1e-7))
-        self.assertTrue(np.array_equal(ind.collect(), skind))
+        self.assertTrue(np.all(ds_y_hat.collect() == sk_y_hat))
 
 
 def main():
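
For completeness, a minimal end-to-end sketch of the new estimator, adapted from the class docstring and the tests above. Block sizes and the explicit label reshape are illustrative assumptions, and a running COMPSs runtime is assumed:

import numpy as np
import dislib as ds
from dislib.classification import KNeighborsClassifier

if __name__ == '__main__':
    x = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([1, 1, 2, 2]).reshape(-1, 1)  # labels as a column

    train_x = ds.array(x, block_size=(2, 2))
    train_y = ds.array(y, block_size=(2, 1))

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train_x, train_y)

    # Predict on the training data, as the tests above do.
    y_pred = knn.predict(train_x)
    print(y_pred.collect())

    # score() returns a future unless collect=True forces synchronization.
    print(knn.score(train_x, train_y, collect=True))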
