|
| 1 | +import numpy as np |
| 2 | +from pycompss.api.api import compss_wait_on |
| 3 | +from pycompss.api.constraint import constraint |
| 4 | +from pycompss.api.parameter import COLLECTION_IN, COLLECTION_OUT, Depth, Type |
| 5 | +from pycompss.api.task import task |
| 6 | +from sklearn.base import BaseEstimator |
| 7 | +from dislib.data.array import Array |
| 8 | +from dislib.neighbors import NearestNeighbors |
| 9 | +from sklearn.metrics import accuracy_score |
| 10 | + |
| 11 | +from collections import defaultdict |
| 12 | + |
| 13 | + |
class KNeighborsClassifier(BaseEstimator):
    """Classifier implementing the k-nearest neighbors vote.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors used for the vote.
    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Any other value is handled like 'distance'.
    random_state : int, RandomState instance or None, optional (default=None)
        Stored on the estimator. None of the methods of this class read it.

    Notes
    -----
    .. warning::
        If several classes obtain the same (weighted) number of votes for a
        query sample, the class that appears first among that sample's
        neighbors wins, so the result depends on the order in which the
        neighbors are returned.

    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm

    Examples
    --------
    >>> import dislib as ds
    >>> from dislib.classification import KNeighborsClassifier
    >>> import numpy as np
    >>>
    >>>
    >>> if __name__ == '__main__':
    >>>     x = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>>     y = np.array([1, 1, 2, 2])
    >>>     train_data = ds.array(x, block_size=(4, 2))
    >>>     train_labels = ds.array(y, block_size=(1, 2))
    >>>     knn = KNeighborsClassifier(n_neighbors=3)
    >>>     knn.fit(train_data, train_labels)
    >>>     test_data = ds.array(np.array([[-0.8, -1]]), block_size=(1, 2))
    >>>     y_pred = knn.predict(test_data)
    >>>     print(y_pred)
    """

    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform',
                 random_state=None):

        self.n_neighbors = n_neighbors
        self.weights = weights
        self.random_state = random_state
        self.nn = NearestNeighbors(n_neighbors)

    def fit(self, x: Array, y: Array):
        """ Fit the model using training data.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Training data.
        y : ds-array, shape=(n_samples, 1)
            Class labels of x.

        Returns
        -------
        self : KNeighborsClassifier
        """
        # Rebuild the neighbor search so that an ``n_neighbors`` modified
        # after construction (e.g. through ``set_params``) is honored.
        self.nn = NearestNeighbors(self.n_neighbors)
        self.y = y
        self.nn.fit(x)

        return self

    def predict(self, q: Array):
        """ Perform classification on samples.

        Parameters
        ----------
        q : ds-array, shape=(n_samples, n_features)
            Input samples.

        Returns
        -------
        y : ds-array, shape=(n_samples, 1)
            Predicted class label of every sample in q.
        """
        dist, ind = self.nn.kneighbors(q)

        # The predictions have one row per *query* sample, so the output
        # layout follows the row partition of the neighbor indices (a single
        # column block), not the layout of the training labels.
        out_blocks = Array._get_out_blocks((ind._n_blocks[0], 1))

        _indices_to_classes(ind._blocks, self.y._blocks, dist._blocks,
                            out_blocks, self.weights)

        return Array(blocks=out_blocks,
                     top_left_shape=(ind._top_left_shape[0], 1),
                     reg_shape=(ind._reg_shape[0], 1),
                     shape=(ind.shape[0], 1), sparse=False)

    def score(self, q: Array, y: Array, collect=False):
        """
        Returns the mean accuracy on the given test data and labels.

        Parameters
        ----------
        q : ds-array, shape=(n_samples, n_features)
            Test samples.
        y : ds-array, shape=(n_samples, 1)
            True labels for q.
        collect : bool, optional (default=False)
            When True, a synchronized result is returned.

        Returns
        -------
        score : float (as future object)
            Mean accuracy of self.predict(q) wrt. y.
        """

        y_pred = self.predict(q)
        score = _get_score(y._blocks, y_pred._blocks)

        return compss_wait_on(score) if collect else score


def _vote_weights(dist, weights):
    """Return the weight of every neighbor's vote, same shape as ``dist``."""
    if weights == 'uniform':
        return np.ones(dist.shape)

    if callable(weights):
        custom = np.asarray(weights(dist))
        if custom.shape != dist.shape:
            raise ValueError("The weights callable must return an array with "
                             "the same shape as the distances it receives.")
        return custom

    # Inverse distance; eps keeps a zero distance from dividing by zero.
    return 1 / (dist + np.finfo(dist.dtype).eps)


@constraint(computing_units="${ComputingUnits}")
@task(ind_blocks={Type: COLLECTION_IN, Depth: 2},
      y_blocks={Type: COLLECTION_IN, Depth: 2},
      dist_blocks={Type: COLLECTION_IN, Depth: 2},
      out_blocks={Type: COLLECTION_OUT, Depth: 2})
def _indices_to_classes(ind_blocks, y_blocks, dist_blocks,
                        out_blocks, weights):
    """Turn neighbor indices into predicted labels through a weighted vote.

    Parameters
    ----------
    ind_blocks : list of lists of blocks
        Indices of the neighbors of every query sample (one row per sample).
    y_blocks : list of lists of blocks
        Training labels; they are merged and flattened before being indexed.
    dist_blocks : list of lists of blocks
        Distances matching ``ind_blocks``.
    out_blocks : list of lists
        Output collection. The first element of every row receives a column
        vector with the predictions of the corresponding row of
        ``ind_blocks``.
    weights : {'uniform', 'distance'} or callable
        Weighting scheme of the vote; see ``_vote_weights``.
    """
    ind = Array._merge_blocks(ind_blocks)
    labels = Array._merge_blocks(y_blocks).flatten()
    dist = Array._merge_blocks(dist_blocks)

    neighbor_classes = labels[ind]
    vote_weights = _vote_weights(dist, weights)

    winners = []
    for class_row, weight_row in zip(neighbor_classes, vote_weights):
        votes = defaultdict(float)
        for label, weight in zip(class_row.flatten(), weight_row):
            votes[label] += weight

        # max() returns the first key reaching the maximum, i.e. ties go to
        # the class seen first among the neighbors.
        winners.append(max(votes, key=votes.get))

    # Cut the predictions at the row boundaries of the query blocks so every
    # output block holds exactly the samples of its input row block.
    row_counts = [row[0].shape[0] for row in ind_blocks]
    boundaries = np.cumsum(row_counts[:-1], dtype=int)
    pieces = np.split(np.asarray(winners), boundaries)

    for out_row, piece in zip(out_blocks, pieces):
        out_row[0] = np.expand_dims(piece, axis=1)
| 172 | + |
| 173 | + |
@constraint(computing_units="${ComputingUnits}")
@task(y_blocks={Type: COLLECTION_IN, Depth: 2},
      ypred_blocks={Type: COLLECTION_IN, Depth: 2},
      returns=float)
def _get_score(y_blocks, ypred_blocks):
    """Compute the accuracy of the predictions against the true labels.

    Parameters
    ----------
    y_blocks : list of lists of blocks
        Blocks holding the true labels.
    ypred_blocks : list of lists of blocks
        Blocks holding the predicted labels.

    Returns
    -------
    score : float
        Result of ``accuracy_score`` on the merged, flattened label arrays.
    """
    # Both collections get the same treatment: merge the blocks into one
    # array and flatten it to 1-D before comparing.
    truth, predicted = (
        Array._merge_blocks(blocks).flatten()
        for blocks in (y_blocks, ypred_blocks)
    )

    return accuracy_score(truth, predicted)
0 commit comments