Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """ | |
| This function is adapted from [pyod] by [yzhao062] | |
| Original source: [https://github.com/yzhao062/pyod] | |
| """ | |
| from __future__ import division | |
| from __future__ import print_function | |
| import warnings | |
| from operator import itemgetter | |
| import numpy as np | |
| from scipy.spatial import distance_matrix | |
| from scipy.spatial import minkowski_distance | |
| from sklearn.utils import check_array | |
| from .base import BaseDetector | |
| from ..utils.utility import check_parameter | |
| class COF(BaseDetector): | |
| """Connectivity-Based Outlier Factor (COF) COF uses the ratio of average | |
| chaining distance of data point and the average of average chaining | |
| distance of k nearest neighbor of the data point, as the outlier score | |
| for observations. | |
| See :cite:`tang2002enhancing` for details. | |
| Two version of COF are supported: | |
| - Fast COF: computes the entire pairwise distance matrix at the cost of a | |
| O(n^2) memory requirement. | |
| - Memory efficient COF: calculates pairwise distances incrementally. | |
| Use this implementation when it is not feasible to fit the n-by-n | |
| distance in memory. This leads to a linear overhead because many | |
| distances will have to be recalculated. | |
| Parameters | |
| ---------- | |
| contamination : float in (0., 0.5), optional (default=0.1) | |
| The amount of contamination of the data set, i.e. | |
| the proportion of outliers in the data set. Used when fitting to | |
| define the threshold on the decision function. | |
| n_neighbors : int, optional (default=20) | |
| Number of neighbors to use by default for k neighbors queries. | |
| Note that n_neighbors should be less than the number of samples. | |
| If n_neighbors is larger than the number of samples provided, | |
| all samples will be used. | |
| method : string, optional (default='fast') | |
| Valid values for method are: | |
| - 'fast' Fast COF, computes the full pairwise distance matrix up front. | |
| - 'memory' Memory-efficient COF, computes pairwise distances only when | |
| needed at the cost of computational speed. | |
| Attributes | |
| ---------- | |
| decision_scores_ : numpy array of shape (n_samples,) | |
| The outlier scores of the training data. | |
| The higher, the more abnormal. Outliers tend to have higher | |
| scores. This value is available once the detector is | |
| fitted. | |
| threshold_ : float | |
| The threshold is based on ``contamination``. It is the | |
| ``n_samples * contamination`` most abnormal samples in | |
| ``decision_scores_``. The threshold is calculated for generating | |
| binary outlier labels. | |
| labels_ : int, either 0 or 1 | |
| The binary labels of the training data. 0 stands for inliers | |
| and 1 for outliers/anomalies. It is generated by applying | |
| ``threshold_`` on ``decision_scores_``. | |
| n_neighbors_: int | |
| Number of neighbors to use by default for k neighbors queries. | |
| """ | |
| def __init__(self, contamination=0.1, n_neighbors=20, method="fast"): | |
| super(COF, self).__init__(contamination=contamination) | |
| if isinstance(n_neighbors, int): | |
| check_parameter(n_neighbors, low=1, param_name='n_neighbors') | |
| else: | |
| raise TypeError( | |
| "n_neighbors should be int. Got %s" % type(n_neighbors)) | |
| self.n_neighbors = n_neighbors | |
| self.method = method | |
| def fit(self, X, y=None): | |
| """Fit detector. y is ignored in unsupervised methods. | |
| Parameters | |
| ---------- | |
| X : numpy array of shape (n_samples, n_features) | |
| The input samples. | |
| y : Ignored | |
| Not used, present for API consistency by convention. | |
| Returns | |
| ------- | |
| self : object | |
| Fitted estimator. | |
| """ | |
| X = check_array(X) | |
| self.n_train_ = X.shape[0] | |
| self.n_neighbors_ = self.n_neighbors | |
| if self.n_neighbors_ >= self.n_train_: | |
| self.n_neighbors_ = self.n_train_ - 1 | |
| warnings.warn( | |
| "n_neighbors is set to the number of training points " | |
| "minus 1: {0}".format(self.n_neighbors_)) | |
| check_parameter(self.n_neighbors_, 1, self.n_train_, | |
| include_left=True, include_right=True) | |
| self._set_n_classes(y) | |
| self.decision_scores_ = self.decision_function(X) | |
| self._process_decision_scores() | |
| return self | |
| def decision_function(self, X): | |
| """Predict raw anomaly score of X using the fitted detector. | |
| The anomaly score of an input sample is computed based on different | |
| detector algorithms. For consistency, outliers are assigned with | |
| larger anomaly scores. | |
| Parameters | |
| ---------- | |
| X : numpy array of shape (n_samples, n_features) | |
| The training input samples. Sparse matrices are accepted only | |
| if they are supported by the base estimator. | |
| Returns | |
| ------- | |
| anomaly_scores : numpy array of shape (n_samples,) | |
| The anomaly score of the input samples. | |
| """ | |
| if self.method.lower() == "fast": | |
| return self._cof_fast(X) | |
| elif self.method.lower() == "memory": | |
| return self._cof_memory(X) | |
| else: | |
| raise ValueError("method should be set to either \'fast\' or \'memory\'. Got %s" % self.method) | |
| def _cof_memory(self, X): | |
| """ | |
| Connectivity-Based Outlier Factor (COF) Algorithm | |
| This function is called internally to calculate the | |
| Connectivity-Based Outlier Factor (COF) as an outlier | |
| score for observations. | |
| This function uses a memory efficient implementation at the cost of | |
| speed. | |
| :return: numpy array containing COF scores for observations. | |
| The greater the COF, the greater the outlierness. | |
| """ | |
| #dist_matrix = np.array(distance_matrix(X, X)) | |
| sbn_path_index = np.zeros((X.shape[0],self.n_neighbors_), dtype=np.int64) | |
| ac_dist, cof_ = np.zeros((X.shape[0])), np.zeros((X.shape[0])) | |
| for i in range(X.shape[0]): | |
| #sbn_path = np.argsort(dist_matrix[i]) | |
| sbn_path = np.argsort(minkowski_distance(X[i,:],X,p=2)) | |
| sbn_path_index[i,:] = sbn_path[1: self.n_neighbors_ + 1] | |
| cost_desc = np.zeros((self.n_neighbors_)) | |
| for j in range(self.n_neighbors_): | |
| #cost_desc.append( | |
| # np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1])) | |
| cost_desc[j] = np.min(minkowski_distance(X[sbn_path[j + 1]],X,p=2)[sbn_path][:j + 1]) | |
| acd = np.zeros((self.n_neighbors_)) | |
| for _h, cost_ in enumerate(cost_desc): | |
| neighbor_add1 = self.n_neighbors_ + 1 | |
| acd[_h] = ((2. * (neighbor_add1 - (_h + 1))) / (neighbor_add1 * self.n_neighbors_)) * cost_ | |
| ac_dist[i] = np.sum(acd) | |
| for _g in range(X.shape[0]): | |
| cof_[_g] = (ac_dist[_g] * self.n_neighbors_) / np.sum(ac_dist[sbn_path_index[_g]]) | |
| return np.nan_to_num(cof_) | |
| def _cof_fast(self, X): | |
| """ | |
| Connectivity-Based Outlier Factor (COF) Algorithm | |
| This function is called internally to calculate the | |
| Connectivity-Based Outlier Factor (COF) as an outlier | |
| score for observations. | |
| This function uses a fast implementation at the cost of memory. | |
| :return: numpy array containing COF scores for observations. | |
| The greater the COF, the greater the outlierness. | |
| """ | |
| dist_matrix = np.array(distance_matrix(X, X)) | |
| sbn_path_index, ac_dist, cof_ = [], [], [] | |
| for i in range(X.shape[0]): | |
| sbn_path = np.argsort(dist_matrix[i]) | |
| sbn_path_index.append(sbn_path[1: self.n_neighbors_ + 1]) | |
| cost_desc = [] | |
| for j in range(self.n_neighbors_): | |
| cost_desc.append( | |
| np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1])) | |
| acd = [] | |
| for _h, cost_ in enumerate(cost_desc): | |
| neighbor_add1 = self.n_neighbors_ + 1 | |
| acd.append(((2. * (neighbor_add1 - (_h + 1))) / ( | |
| neighbor_add1 * self.n_neighbors_)) * cost_) | |
| ac_dist.append(np.sum(acd)) | |
| for _g in range(X.shape[0]): | |
| cof_.append((ac_dist[_g] * self.n_neighbors_) / | |
| np.sum(itemgetter(*sbn_path_index[_g])(ac_dist))) | |
| return np.nan_to_num(cof_) |