
Commit 5e931b5

Add KNN Classifier (#397)
* add knn classifier, tests and doc
1 parent 2896c28 commit 5e931b5

File tree: 10 files changed (+294, -40 lines)

dislib/classification/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from dislib.classification.csvm.base import CascadeSVM
 from dislib.trees.forest import RandomForestClassifier
+from dislib.classification.knn.base import KNeighborsClassifier
 
-__all__ = ["CascadeSVM", "RandomForestClassifier"]
+__all__ = ["CascadeSVM", "RandomForestClassifier", "KNeighborsClassifier"]

dislib/classification/knn/__init__.py

Whitespace-only changes.

dislib/classification/knn/base.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
+import numpy as np
+from pycompss.api.api import compss_wait_on
+from pycompss.api.constraint import constraint
+from pycompss.api.parameter import COLLECTION_IN, COLLECTION_OUT, Depth, Type
+from pycompss.api.task import task
+from sklearn.base import BaseEstimator
+from dislib.data.array import Array
+from dislib.neighbors import NearestNeighbors
+from sklearn.metrics import accuracy_score
+
+from collections import defaultdict
+
+
+class KNeighborsClassifier(BaseEstimator):
+    """Classifier implementing the k-nearest neighbors vote.
+
+    Parameters
+    ----------
+    n_neighbors : int, default=5
+        Number of neighbors to use by default for :meth:`kneighbors` queries.
+    weights : {'uniform', 'distance'} or callable, default='uniform'
+        Weight function used in prediction. Possible values:
+
+        - 'uniform' : uniform weights. All points in each neighborhood
+          are weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          In this case, closer neighbors of a query point have a greater
+          influence than neighbors which are further away.
+        - [callable] : a user-defined function which accepts an array of
+          distances and returns an array of the same shape containing
+          the weights.
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used when shuffling
+        the data for probability estimates. If int, random_state is the
+        seed used by the random number generator; if RandomState instance,
+        random_state is the random number generator; if None, the random
+        number generator is the RandomState instance used by np.random.
+
+    Notes
+    -----
+    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
+    for a discussion of the choice of ``algorithm`` and ``leaf_size``.
+
+    .. warning::
+       If two neighbors, neighbor `k+1` and `k`, have identical distances
+       but different labels, the result depends on the ordering of the
+       training data.
+
+    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
+
+    Examples
+    --------
+    >>> import dislib as ds
+    >>> from dislib.classification import KNeighborsClassifier
+    >>> import numpy as np
+    >>>
+    >>> if __name__ == '__main__':
+    >>>     x = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+    >>>     y = np.array([1, 1, 2, 2])
+    >>>     train_data = ds.array(x, block_size=(4, 2))
+    >>>     train_labels = ds.array(y, block_size=(1, 2))
+    >>>     knn = KNeighborsClassifier(n_neighbors=3)
+    >>>     knn.fit(train_data, train_labels)
+    >>>     test_data = ds.array(np.array([[-0.8, -1]]), block_size=(1, 2))
+    >>>     y_pred = knn.predict(test_data)
+    >>>     print(y_pred)
+    """
+
+    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform',
+                 random_state=None):
+        self.n_neighbors = n_neighbors
+        self.weights = weights
+        self.random_state = random_state
+        self.nn = NearestNeighbors(n_neighbors)
+
+    def fit(self, x: Array, y: Array):
+        """ Fit the model using training data.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            Training data.
+        y : ds-array, shape=(n_samples, 1)
+            Class labels of x.
+
+        Returns
+        -------
+        self : KNeighborsClassifier
+        """
+        self.y = y
+        self.nn.fit(x)
+
+        return self
+
+    def predict(self, q: Array):
+        """ Perform classification on samples.
+
+        Parameters
+        ----------
+        q : ds-array, shape=(n_samples, n_features)
+            Input samples.
+
+        Returns
+        -------
+        y : ds-array, shape=(n_samples, 1)
+            Predicted class labels of q.
+        """
+        dist, ind = self.nn.kneighbors(q)
+
+        out_blocks = Array._get_out_blocks(self.y._n_blocks)
+
+        _indices_to_classes(ind._blocks, self.y._blocks, dist._blocks,
+                            out_blocks, self.weights)
+
+        return Array(blocks=out_blocks, top_left_shape=self.y._top_left_shape,
+                     reg_shape=self.y._reg_shape,
+                     shape=self.y.shape, sparse=False)
+
+    def score(self, q: Array, y: Array, collect=False):
+        """ Returns the mean accuracy on the given test data and labels.
+
+        Parameters
+        ----------
+        q : ds-array, shape=(n_samples, n_features)
+            Test samples.
+        y : ds-array, shape=(n_samples, 1)
+            True labels for q.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+        Returns
+        -------
+        score : float (as future object)
+            Mean accuracy of self.predict(q) wrt. y.
+        """
+        y_pred = self.predict(q)
+        score = _get_score(y._blocks, y_pred._blocks)
+
+        return compss_wait_on(score) if collect else score
+
+
+@constraint(computing_units="${ComputingUnits}")
+@task(ind_blocks={Type: COLLECTION_IN, Depth: 2},
+      y_blocks={Type: COLLECTION_IN, Depth: 2},
+      dist_blocks={Type: COLLECTION_IN, Depth: 2},
+      out_blocks={Type: COLLECTION_OUT, Depth: 2})
+def _indices_to_classes(ind_blocks, y_blocks, dist_blocks,
+                        out_blocks, weights):
+    ind = Array._merge_blocks(ind_blocks)
+    y = Array._merge_blocks(y_blocks).flatten()
+    dist = Array._merge_blocks(dist_blocks)
+
+    # Map each neighbor index to the class of that training sample.
+    classes = y[ind]
+
+    final_class = []
+    for crow, drow in zip(classes, dist):
+        d = defaultdict(int)
+        crow = crow.flatten()
+        for j in range(ind.shape[1]):
+            if weights == 'uniform':
+                w = 1
+            else:
+                w = (drow[j] + np.finfo(drow.dtype).eps)
+
+            d[crow[j]] += 1 / w
+
+        # Keep the class with the largest accumulated weight.
+        final_class.append(max(d, key=d.get))
+
+    blocks = np.array_split(final_class, len(y_blocks))
+
+    for i in range(len(y_blocks)):
+        out_blocks[i][0] = np.expand_dims(blocks[i][:], axis=1)
+
+
+@constraint(computing_units="${ComputingUnits}")
+@task(y_blocks={Type: COLLECTION_IN, Depth: 2},
+      ypred_blocks={Type: COLLECTION_IN, Depth: 2},
+      returns=float)
+def _get_score(y_blocks, ypred_blocks):
+    y = Array._merge_blocks(y_blocks).flatten()
+    y_pred = Array._merge_blocks(ypred_blocks).flatten()
+
+    return accuracy_score(y, y_pred)
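
For readers unfamiliar with the voting scheme, the COMPSs task `_indices_to_classes` above reduces neighbor indices and distances to one class label per query. The following standalone NumPy sketch reproduces that logic locally; the toy `y_train`, `ind` and `dist` values are made up for illustration (the real inputs come from `NearestNeighbors.kneighbors`):

import numpy as np
from collections import defaultdict

# Toy data: labels of 4 training points, plus the indices and distances
# of the 3 nearest neighbors of 2 query points.
y_train = np.array([1, 1, 2, 2])
ind = np.array([[0, 1, 2], [2, 3, 0]])
dist = np.array([[0.1, 0.5, 2.0], [0.2, 0.4, 1.9]])


def vote(ind, dist, y_train, weights='uniform'):
    classes = y_train[ind]  # neighbor indices -> neighbor class labels
    y_pred = []
    for crow, drow in zip(classes, dist):
        acc = defaultdict(float)
        for c, d in zip(crow, drow):
            # 'uniform' counts each neighbor once; anything else weights by
            # inverse distance, with eps guarding against division by zero
            # (the same guard base.py uses).
            if weights == 'uniform':
                w = 1.0
            else:
                w = 1.0 / (d + np.finfo(dist.dtype).eps)
            acc[c] += w
        y_pred.append(max(acc, key=acc.get))
    return np.array(y_pred)


print(vote(ind, dist, y_train))              # [1 2]
print(vote(ind, dist, y_train, 'distance'))  # [1 2]

Note that, as in the task above, any `weights` value other than 'uniform' falls through to inverse-distance weighting.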

dislib/decomposition/pca/base.py

Lines changed: 8 additions & 3 deletions
@@ -153,8 +153,10 @@ def _fit_eig(self, x):
 
         self.components_ = Array(vec_blocks, bshape, bshape,
                                  (shape1, x.shape[1]), False)
-        self.explained_variance_ = Array(val_blocks, bshape, bshape,
-                                         (1, shape1), False)
+
+        ex_var_bshape = (1, bshape)
+        self.explained_variance_ = Array(val_blocks, ex_var_bshape,
+                                         ex_var_bshape, (1, shape1), False)
 
         return self

@@ -247,8 +249,11 @@ def _decompose(covariance_matrix, n_components, bsize, val_blocks, vec_blocks):
     signs = np.sign(eig_vec[range(len(eig_vec)), max_abs_cols])
     eig_vec *= signs[:, np.newaxis]
 
+    if len(eig_val.shape) == 1:
+        eig_val = np.expand_dims(eig_val, axis=0)
+
     for i in range(len(vec_blocks)):
-        val_blocks[0][i] = eig_val[i * bsize:(i + 1) * bsize]
+        val_blocks[0][i] = eig_val[:, i * bsize:(i + 1) * bsize]
 
         for j in range(len(vec_blocks[i])):
             vec_blocks[i][j] = \
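
The PCA change above guards against 1-D eigenvalue arrays of the kind `np.linalg.eigh` returns: promoting them to a (1, n) row vector makes the block loop slice out 2-D blocks, as ds-arrays expect. A minimal sketch of the fixed slicing, with illustrative values for the array and the block width `bsize`:

import numpy as np

eig_val = np.array([4.0, 3.0, 2.0, 1.0])  # 1-D, as np.linalg.eigh returns it
bsize = 2  # block width

# Before the fix, eig_val[i * bsize:(i + 1) * bsize] produced 1-D chunks,
# while ds-array blocks must be 2-D. Promoting to a row vector first makes
# the column slices come out as (1, bsize) blocks.
if len(eig_val.shape) == 1:
    eig_val = np.expand_dims(eig_val, axis=0)  # shape (1, 4)

blocks = [eig_val[:, i * bsize:(i + 1) * bsize] for i in range(2)]
print([b.shape for b in blocks])  # [(1, 2), (1, 2)]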

docker/Dockerfile

Lines changed: 14 additions & 10 deletions
@@ -1,10 +1,13 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
 MAINTAINER COMPSs Support <[email protected]>
 
 # =============================================================================
 # Configuration required to use the image for jenkins testing
 # =============================================================================
 
+ENV DEBIAN_FRONTEND noninteractive
+ENV COMPSS_LOAD_SOURCE false
+
 RUN apt-get update && \
     # Install Packages
     apt-get install -y --no-install-recommends \

@@ -40,13 +43,15 @@ RUN apt-get update && \
     # Extrae dependencies
     libxml2 gfortran libpapi-dev papi-tools \
     # Misc. dependencies
-    openmpi-bin openmpi-doc libopenmpi-dev uuid-runtime curl bc cmake \
+    openmpi-bin openmpi-doc libopenmpi-dev uuid-runtime curl bc cmake && \
     # Python-binding and dislib dependencies
-    python3-dev python3-pip python3-setuptools && \
-    pip3 install wheel dill decorator coverage numpy==1.15.4 ipython==7.9.0 \
-        scipy==1.3.0 jupyter==1.0.0 scikit-learn==0.19.1 pandas==0.23.1 \
-        matplotlib==2.2.3 flake8 codecov parameterized && \
-    pip3 install cvxpy==1.1.5 && \
+    apt-get install -y python3 python3-pip python3-dev graphviz-dev && \
+    python3 -m pip install coverage ipython flake8 codecov parameterized cvxpy==1.1.5 && \
+    python3 -m pip install pycodestyle pydocstyle mpi4py numpy dill guppy3 memory_profiler matplotlib decorator \
+        jupyter pytest nbval pytest-cov pytest-notebook ipyparallel jupyter_nbextensions_configurator jupyterlab \
+        pytest nbval pytest-cov pytest-html-profiling pytest-metadata pytest-profiling pytest-subprocess pytest-sugar spacy && \
+    python3 -m ipykernel install && \
+    python3 -m spacy download en_core_web_sm && \
     # Configure user environment
 # =============================================================================
 # System configuration

@@ -57,12 +62,11 @@ RUN apt-get update && \
     echo "LD_LIBRARY_PATH=/usr/lib/openmpi/lib" >> /etc/environment && \
    mkdir /run/sshd && \
     # Clone framework files for installation
-    git clone --depth=1 --branch 2.8 https://github.com/bsc-wdc/compss.git framework && \
+    git clone --depth=1 --branch 3.0 https://github.com/bsc-wdc/compss.git framework && \
     # Install COMPSs
     cd /framework && \
     ./submodules_get.sh && \
-    ./submodules_patch.sh && \
-    sudo -E /framework/builders/buildlocal -NpAKT /opt/COMPSs && \
+    sudo -E /framework/builders/buildlocal -NpKT /opt/COMPSs && \
     rm -rf /framework /root/.m2 /root/.cache /home/jenkins/.COMPSs /tmp/* && \
     rm -rf /var/lib/apt/lists/*

docs/source/api-reference.rst

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,9 @@ dislib.classification: Classification
 :class:`classification.CascadeSVM <dislib.classification.csvm.base.CascadeSVM>`
 - Distributed support vector classification using a cascade of classifiers.
 
+:class:`classification.KNeighborsClassifier <dislib.classification.knn.base.KNeighborsClassifier>`
+- Distributed K neighbors classification using partial classifiers.
+
 
 dislib.cluster: Clustering
 --------------------------
docs/source/… (new file)

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+dislib.classification.KNeighborsClassifier
+===========================================
+
+.. automodule:: dislib.classification.knn.base
+   :members:
+   :undoc-members:
+   :show-inheritance:

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ numpydoc>=0.8.0
 cvxpy>=1.1.5
 cbor2>=5.4.0
 pandas>=0.24.2
-matplotlib>=2.2.3
\ No newline at end of file
+matplotlib>=2.2.3

tests/test_knn.py

Lines changed: 26 additions & 25 deletions
@@ -2,46 +2,47 @@
 
 import numpy as np
 from scipy.sparse import csr_matrix
-from sklearn.neighbors import NearestNeighbors as SKNearestNeighbors
+from sklearn.neighbors import KNeighborsClassifier as skKNeighborsClassifier
+from sklearn.datasets import make_classification
 
 import dislib as ds
-from dislib.neighbors import NearestNeighbors
-from tests import BaseTimedTestCase
+from dislib.classification import KNeighborsClassifier
 
 
-class NearestNeighborsTest(BaseTimedTestCase):
+class KNearestNeighborsTest(unittest.TestCase):
+
     def test_kneighbors(self):
         """ Tests kneighbors against scikit-learn """
-        x = np.random.random((1500, 5))
-        data = ds.array(x, block_size=(500, 3))
-        q_data = ds.array(x, block_size=(101, 2))
 
-        knn = NearestNeighbors(n_neighbors=10)
-        knn.fit(data)
-        dist, ind = knn.kneighbors(q_data)
+        X, Y = make_classification(n_samples=200, n_features=5)
+        x, y = ds.array(X, (50, 5)), ds.array(Y, (50, 1))
+
+        knn = KNeighborsClassifier(n_neighbors=3)
+        knn.fit(x, y)
+        ds_y_hat = knn.predict(x)
+        knn.score(x, y)
 
-        sknn = SKNearestNeighbors(n_neighbors=10)
-        sknn.fit(X=x)
-        skdist, skind = sknn.kneighbors(X=x)
+        sknn = skKNeighborsClassifier(n_neighbors=3)
+        sknn.fit(X, Y)
+        sk_y_hat = sknn.predict(X)
 
-        self.assertTrue(np.allclose(dist.collect(), skdist, atol=1e-7))
-        self.assertTrue(np.array_equal(ind.collect(), skind))
+        self.assertTrue(np.all(ds_y_hat.collect() == sk_y_hat))
 
     def test_kneighbors_sparse(self):
         """ Tests kneighbors against scikit-learn with sparse data """
-        x = csr_matrix(np.random.random((1500, 5)))
-        data = ds.array(x, block_size=(500, 5))
+        X, Y = make_classification(n_samples=200, n_features=5)
+        X = csr_matrix(X)
+        x, y = ds.array(X, (50, 5)), ds.array(Y, (50, 1))
 
-        knn = NearestNeighbors(n_neighbors=10)
-        knn.fit(data)
-        dist, ind = knn.kneighbors(data)
+        knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
+        knn.fit(x, y)
+        ds_y_hat = knn.predict(x)
 
-        sknn = SKNearestNeighbors(n_neighbors=10)
-        sknn.fit(X=x)
-        skdist, skind = sknn.kneighbors(X=x)
+        sknn = skKNeighborsClassifier(n_neighbors=3, weights='distance')
+        sknn.fit(X, Y)
+        sk_y_hat = sknn.predict(X)
 
-        self.assertTrue(np.allclose(dist.collect(), skdist, atol=1e-7))
-        self.assertTrue(np.array_equal(ind.collect(), skind))
+        self.assertTrue(np.all(ds_y_hat.collect() == sk_y_hat))
 
 
 def main():
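
For completeness, a minimal end-to-end sketch of the new estimator, adapted from the class docstring and the tests above. Block sizes and the explicit label reshape are illustrative assumptions, and a running COMPSs runtime is assumed:

import numpy as np
import dislib as ds
from dislib.classification import KNeighborsClassifier

if __name__ == '__main__':
    x = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([1, 1, 2, 2]).reshape(-1, 1)  # labels as a column

    train_x = ds.array(x, block_size=(2, 2))
    train_y = ds.array(y, block_size=(2, 1))

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train_x, train_y)

    # Predict on the training data, as the tests above do.
    y_pred = knn.predict(train_x)
    print(y_pred.collect())

    # score() returns a future unless collect=True forces synchronization.
    print(knn.score(train_x, train_y, collect=True))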
