diff --git a/selector/measures/similarity.py b/selector/measures/similarity.py index b7443592..ce4bb1c5 100644 --- a/selector/measures/similarity.py +++ b/selector/measures/similarity.py @@ -1,326 +1,651 @@ -# -*- coding: utf-8 -*- -# -# The Selector is a Python library of algorithms for selecting diverse -# subsets of data for machine-learning. -# -# Copyright (C) 2022-2024 The QC-Devs Community -# -# This file is part of Selector. -# -# Selector is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# Selector is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see -# -# -- -"""Similarity Module.""" - -from itertools import combinations_with_replacement - -import numpy as np - -__all__ = [ - "pairwise_similarity_bit", - "tanimoto", - "modified_tanimoto", - "scaled_similarity_matrix", -] - - -def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: - """Compute pairwise similarity coefficient matrix. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Feature matrix of `n_samples` samples in `n_features` dimensional space. - metric : str - The metric used when calculating similarity coefficients between samples in a feature array. - Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`. - - Returns - ------- - s : ndarray of shape (n_samples, n_samples) - A symmetric similarity matrix between each pair of samples in the feature matrix. - The diagonal elements are directly computed instead of assuming that they are 1. - - """ - - available_methods = { - "tanimoto": tanimoto, - "modified_tanimoto": modified_tanimoto, - } - if metric not in available_methods: - raise ValueError( - f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}" - ) - if X.ndim != 2: - raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") - - # make pairwise m-by-m similarity matrix - n_samples = len(X) - s = np.zeros((n_samples, n_samples)) - # compute similarity between all pairs of points (including the diagonal elements) - for i, j in combinations_with_replacement(range(n_samples), 2): - s[i, j] = s[j, i] = available_methods[metric](X[i], X[j]) - return s - - -def tanimoto(a: np.array, b: np.array) -> float: - r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient). - - For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient - is defined as the size of their intersection divided by the size of their union: - - .. math:: - T(A, B) = \frac{| A \cap B|}{| A \cup B |} = - \frac{| A \cap B|}{|A| + |B| - | A \cap B|} = - \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B} - - where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`. - - Parameters - ---------- - a : ndarray of shape (n_features,) - The 1D feature array of sample :math:`A` in an `n_features` dimensional space. - b : ndarray of shape (n_features,) - The 1D feature array of sample :math:`B` in an `n_features` dimensional space. - - Returns - ------- - coeff : float - Tanimoto coefficient between feature arrays :math:`A` and :math:`B`. - - Bajusz, D., Rácz, A., and Héberger, K.. (2015) - Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. - Journal of Cheminformatics 7. - - """ - if a.ndim != 1 or b.ndim != 1: - raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}") - if a.shape != b.shape: - raise ValueError( - f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" - ) - coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b)) - return coeff - - -def modified_tanimoto(a: np.array, b: np.array) -> float: - r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B. - - Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards - shorter vectors using a Bernoulli probability model. - - .. math:: - {mt} = \frac{2-p}{3} T_1 + \frac{1+p}{3} T_0 - - where :math:`p` is success probability of independent trials, - :math:`T_1` is the number of common '1' bits between data points - (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0' - bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`). - - - Parameters - ---------- - a : ndarray of shape (n_features,) - The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space. - b : ndarray of shape (n_features,) - The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space. - - Returns - ------- - mt : float - Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`. - - Notes - ----- - The equation above has been derived from - - .. math:: - {mt}_{\alpha} = {\alpha}T_1 + (1-\alpha)T_0 - - where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value - of the modified tanimoto, :math:`E(mt)`, remains constant even as the number of - trials :math:`p` grows larger. - - Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002) - A Modification of the Jaccard-Tanimoto Similarity Index for - Diverse Selection of Chemical Compounds Using Binary Strings. - Technometrics 44, 110-119. - - """ - if a.ndim != 1: - raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.") - if b.ndim != 1: - raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.") - if a.shape != b.shape: - raise ValueError( - f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" - ) - - n_features = len(a) - # number of common '1' bits between points A and B - n_11 = sum(a * b) - # number of common '0' bits between points A and B - n_00 = sum((1 - a) * (1 - b)) - - # calculate Tanimoto coefficient based on '0' bits - t_1 = 1 - if n_00 != n_features: - # bit strings are not all '0's - t_1 = n_11 / (n_features - n_00) - # calculate Tanimoto coefficient based on '1' bits - t_0 = 1 - if n_11 != n_features: - # bit strings are not all '1's - t_0 = n_00 / (n_features - n_11) - - # combine into modified tanimoto using Bernoulli Model - # p = independent success trials - # evaluated as total number of '1' bits - # divided by 2x the fingerprint length - p = (n_features - n_00 + n_11) / (2 * n_features) - # mt = x * T_1 + (1-x) * T_0 - # x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p - mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0) - return mt - - -def scaled_similarity_matrix(X: np.array) -> np.ndarray: - r"""Compute the scaled similarity matrix. - - .. math:: - X(i,j) = \frac{X(i,j)}{\sqrt{X(i,i)X(j,j)}} - - Parameters - ---------- - X : ndarray of shape (n_samples, n_samples) - Similarity matrix of `n_samples`. - - Returns - ------- - s : ndarray of shape (n_samples, n_samples) - A scaled symmetric similarity matrix. - - """ - if X.ndim != 2: - raise ValueError(f"Argument similarity matrix should be a 2D array, got {X.ndim}") - - if X.shape[0] != X.shape[1]: - raise ValueError( - f"Argument similarity matrix should be a square matrix (having same number of rows and columns), got {X.shape[0]} and {X.shape[1]}" - ) - - if not (np.all(X >= 0) and np.all(np.diag(X) > 0)): - raise ValueError( - "All elements of similarity matrix should be greater than zero and diagonals should be non-zero" - ) - - # scaling does not happen if the matrix is binary similarity matrix with all diagonal elements as 1 - if np.all(np.diag(X) == 1): - print("No scaling is taking effect") - return X - else: - # make a scaled similarity matrix - n_samples = len(X) - s = np.zeros((n_samples, n_samples)) - # calculate the square root of the diagonal elements - sqrt_diag = np.sqrt(np.diag(X)) - # calculate the product of the square roots of the diagonal elements - product_sqrt_diag = np.outer(sqrt_diag, sqrt_diag) - # divide each element of the matrix by the product of the square roots of diagonal elements - s = X / product_sqrt_diag - return s - - -def similarity_index(x: np.array, y: np.array, sim_index: str) -> float: - """Compute similarity index matrix. - - Parameters - ---------- - x : ndarray of shape (n_features,) - Feature array of sample `x` in an `n_features` dimensional space - y : ndarray of shape (n_features,) - Feature array of sample `y` in an `n_features` dimensional space - sim_index : str, optional - The key with the abbreviation of the similarity index to be used for calculations. - Possible values are: - - 'AC': Austin-Colwell - - 'BUB': Baroni-Urbani-Buser - - 'CTn': Consoni-Todschini n (n=1,2) - - 'Fai': Faith - - 'Gle': Gleason - - 'Ja': Jaccard - - 'JT': Jaccard-Tanimoto - - 'RT': Rogers-Tanimoto - - 'RR': Russel-Rao - - 'SM': Sokal-Michener - - 'SSn': Sokal-Sneath n (n=1,2) - Default is 'RR'. - - Returns - ------- - sim : float - The similarity index value between the feature arrays `x` and `y`. - """ - # Define the similarity index functions - similarity_indices = { - "AC": lambda a, d, dis, p: 2 / np.pi * np.arcsin(((a + d) / p) ** 0.5), - "BUB": lambda a, d, dis, p: ((a * d) ** 0.5 + a) / ((a * d) ** 0.5 + a + dis), - "CT1": lambda a, d, dis, p: np.log(1 + a + d) / np.log(1 + p), - "CT2": lambda a, d, dis, p: (np.log(1 + p) - np.log(1 + dis)) / np.log(1 + p), - "Fai": lambda a, d, dis, p: (a + 0.5 * d) / p, - "Gle": lambda a, d, dis, p: 2 * a / (2 * a + dis), - "Ja": lambda a, d, dis, p: 3 * a / (3 * a + dis), - "JT": lambda a, d, dis, p: a / (a + dis), - "RT": lambda a, d, dis, p: (a + d) / (p + dis), - "RR": lambda a, d, dis, p: a / p, - "SM": lambda a, d, dis, p: (a + d) / p, - "SS1": lambda a, d, dis, p: a / (a + 2 * dis), - "SS2": lambda a, d, dis, p: (2 * (a + d)) / (p + (a + d)), - } - - if sim_index not in similarity_indices: - raise ValueError( - f"Argument sim_index={sim_index} is not recognized! Choose from {similarity_indices.keys()}" - ) - if x.ndim != 1 or y.ndim != 1: - raise ValueError(f"Arguments x and y should be 1D arrays, got {x.ndim} and {y.ndim}") - if x.shape != y.shape: - raise ValueError( - f"Arguments x and y should have the same shape, got {x.shape} != {y.shape}" - ) - a, d, dis, p = _compute_base_descriptors(x, y) - return similarity_indices[sim_index](a, d, dis, p) - - -def _compute_base_descriptors(x, y): - """Compute the base descriptors for the similarity indices. - - Parameters - ---------- - x : ndarray of shape (n_features,) - Feature array of sample `x` in an `n_features` dimensional space - y : ndarray of shape (n_features,) - Feature array of sample `y` in an `n_features` dimensional space - - Returns - ------- - tuple(int, int, int, int) - The number of common on bits, number of common off bits, number of 1-0 mismatches, and the - length of the fingerprint. - """ - p = len(x) - a = np.dot(x, y) - d = np.dot(1 - x, 1 - y) - dis = p - a - d - return a, d, dis, p +# -*- coding: utf-8 -*- + +# + +# The Selector is a Python library of algorithms for selecting diverse + +# subsets of data for machine-learning. + +# + +# Copyright (C) 2022-2024 The QC-Devs Community + +# + +# This file is part of Selector. + +# + +# Selector is free software; you can redistribute it and/or + +# modify it under the terms of the GNU General Public License + +# as published by the Free Software Foundation; either version 3 + +# of the License, or (at your option) any later version. + +# + +# Selector is distributed in the hope that it will be useful, + +# but WITHOUT ANY WARRANTY; without even the implied warranty of + +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + +# GNU General Public License for more details. + +# + +# You should have received a copy of the GNU General Public License + +# along with this program; if not, see + +# + +# -- + +"""Similarity Module.""" + + + +from itertools import combinations_with_replacement + + + +import numpy as np + + + +__all__ = [ + + "pairwise_similarity_bit", + + "tanimoto", + + "modified_tanimoto", + + "scaled_similarity_matrix", + +] + + + + + +def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: + + """Compute pairwise similarity coefficient matrix. + + + + Parameters + + ---------- + + X : ndarray of shape (n_samples, n_features) + + Feature matrix of `n_samples` samples in `n_features` dimensional space. + + metric : str + + The metric used when calculating similarity coefficients between samples in a feature array. + + Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`. + + + + Returns + + ------- + + s : ndarray of shape (n_samples, n_samples) + + A symmetric similarity matrix between each pair of samples in the feature matrix. + + The diagonal elements are directly computed instead of assuming that they are 1. + + + + """ + + + + available_methods = { + + "tanimoto": tanimoto, + + "modified_tanimoto": modified_tanimoto, + + } + + if metric not in available_methods: + + raise ValueError( + + f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}" + + ) + + if X.ndim != 2: + + raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") + + + + # make pairwise m-by-m similarity matrix + + n_samples = len(X) + + s = np.zeros((n_samples, n_samples)) + + # compute similarity between all pairs of points (including the diagonal elements) + + for i, j in combinations_with_replacement(range(n_samples), 2): + + s[i, j] = s[j, i] = available_methods[metric](X[i], X[j]) + + return s + + + + + +def tanimoto(a: np.array, b: np.array) -> float: + + r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient). + + + + For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient + + is defined as the size of their intersection divided by the size of their union: + + + + .. math:: + + T(A, B) = \frac{| A \cap B|}{| A \cup B |} = + + \frac{| A \cap B|}{|A| + |B| - | A \cap B|} = + + \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B} + + + + where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`. + + + + Parameters + + ---------- + + a : ndarray of shape (n_features,) + + The 1D feature array of sample :math:`A` in an `n_features` dimensional space. + + b : ndarray of shape (n_features,) + + The 1D feature array of sample :math:`B` in an `n_features` dimensional space. + + + + Returns + + ------- + + coeff : float + + Tanimoto coefficient between feature arrays :math:`A` and :math:`B`. + + + + Bajusz, D., Rácz, A., and Héberger, K.. (2015) + + Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. + + Journal of Cheminformatics 7. + + + + """ + + if a.ndim != 1 or b.ndim != 1: + + raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}") + + if a.shape != b.shape: + + raise ValueError( + + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + + ) + + coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b)) + + return coeff + + + + + +def modified_tanimoto(a: np.array, b: np.array) -> float: + + r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B. + + + + Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards + + shorter vectors using a Bernoulli probability model. + + + + .. math:: + + {mt} = \frac{2-p}{3} T_1 + \frac{1+p}{3} T_0 + + + + where :math:`p` is success probability of independent trials, + + :math:`T_1` is the number of common '1' bits between data points + + (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0' + + bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`). + + + + + + Parameters + + ---------- + + a : ndarray of shape (n_features,) + + The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space. + + b : ndarray of shape (n_features,) + + The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space. + + + + Returns + + ------- + + mt : float + + Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`. + + + + Notes + + ----- + + The equation above has been derived from + + + + .. math:: + + {mt}_{\alpha} = {\alpha}T_1 + (1-\alpha)T_0 + + + + where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value + + of the modified tanimoto, :math:`E(mt)`, remains constant even as the number of + + trials :math:`p` grows larger. + + + + Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002) + + A Modification of the Jaccard-Tanimoto Similarity Index for + + Diverse Selection of Chemical Compounds Using Binary Strings. + + Technometrics 44, 110-119. + + + + """ + + if a.ndim != 1: + + raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.") + + if b.ndim != 1: + + raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.") + + if a.shape != b.shape: + + raise ValueError( + + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + + ) + + + + n_features = len(a) + + # number of common '1' bits between points A and B + + n_11 = sum(a * b) + + # number of common '0' bits between points A and B + + n_00 = sum((1 - a) * (1 - b)) + + + + # calculate Tanimoto coefficient based on '0' bits + + t_1 = 1 + + if n_00 != n_features: + + # bit strings are not all '0's + + t_1 = n_11 / (n_features - n_00) + + # calculate Tanimoto coefficient based on '1' bits + + t_0 = 1 + + if n_11 != n_features: + + # bit strings are not all '1's + + t_0 = n_00 / (n_features - n_11) + + + + # combine into modified tanimoto using Bernoulli Model + + # p = independent success trials + + # evaluated as total number of '1' bits + + # divided by 2x the fingerprint length + + p = (n_features - n_00 + n_11) / (2 * n_features) + + # mt = x * T_1 + (1-x) * T_0 + + # x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p + + mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0) + + return mt + + + + + +def scaled_similarity_matrix(X: np.array) -> np.ndarray: + + r"""Compute the scaled similarity matrix. + + + + .. math:: + + X(i,j) = \frac{X(i,j)}{\sqrt{X(i,i)X(j,j)}} + + + + Parameters + + ---------- + + X : ndarray of shape (n_samples, n_samples) + + Similarity matrix of `n_samples`. + + + + Returns + + ------- + + s : ndarray of shape (n_samples, n_samples) + + A scaled symmetric similarity matrix. + + + + """ + + if X.ndim != 2: + + raise ValueError(f"Argument similarity matrix should be a 2D array, got {X.ndim}") + + + + if X.shape[0] != X.shape[1]: + + raise ValueError( + + f"Argument similarity matrix should be a square matrix (having same number of rows and columns), got {X.shape[0]} and {X.shape[1]}" + + ) + + + + if not (np.all(X >= 0) and np.all(np.diag(X) > 0)): + + raise ValueError( + + "All elements of similarity matrix should be greater than zero and diagonals should be non-zero" + + ) + + + + # scaling does not happen if the matrix is binary similarity matrix with all diagonal elements as 1 + + if np.all(np.diag(X) == 1): + + print("No scaling is taking effect") + + return X + + else: + + # make a scaled similarity matrix + + n_samples = len(X) + + s = np.zeros((n_samples, n_samples)) + + # calculate the square root of the diagonal elements + + sqrt_diag = np.sqrt(np.diag(X)) + + # calculate the product of the square roots of the diagonal elements + + product_sqrt_diag = np.outer(sqrt_diag, sqrt_diag) + + # divide each element of the matrix by the product of the square roots of diagonal elements + + s = X / product_sqrt_diag + + return s + + + + + +def similarity_index(x: np.array, y: np.array, sim_index: str) -> float: + + """Compute similarity index matrix. + + + + Parameters + + ---------- + + x : ndarray of shape (n_features,) + + Feature array of sample `x` in an `n_features` dimensional space + + y : ndarray of shape (n_features,) + + Feature array of sample `y` in an `n_features` dimensional space + + sim_index : str, optional + + The key with the abbreviation of the similarity index to be used for calculations. + + Possible values are: + + - 'AC': Austin-Colwell + + - 'BUB': Baroni-Urbani-Buser + + - 'CTn': Consoni-Todschini n (n=1,2) + + - 'Fai': Faith + + - 'Gle': Gleason + + - 'Ja': Jaccard + + - 'JT': Jaccard-Tanimoto + + - 'RT': Rogers-Tanimoto + + - 'RR': Russel-Rao + + - 'SM': Sokal-Michener + + - 'SSn': Sokal-Sneath n (n=1,2) + + Default is 'RR'. + + + + Returns + + ------- + + sim : float + + The similarity index value between the feature arrays `x` and `y`. + + """ + + # Define the similarity index functions + + similarity_indices = { + + "AC": lambda a, d, dis, p: 2 / np.pi * np.arcsin(((a + d) / p) ** 0.5), + + "BUB": lambda a, d, dis, p: ((a * d) ** 0.5 + a) / ((a * d) ** 0.5 + a + dis), + + "CT1": lambda a, d, dis, p: np.log(1 + a + d) / np.log(1 + p), + + "CT2": lambda a, d, dis, p: (np.log(1 + p) - np.log(1 + dis)) / np.log(1 + p), + + "Fai": lambda a, d, dis, p: (a + 0.5 * d) / p, + + "Gle": lambda a, d, dis, p: 2 * a / (2 * a + dis), + + "Ja": lambda a, d, dis, p: 3 * a / (3 * a + dis), + + "JT": lambda a, d, dis, p: a / (a + dis), + + "RT": lambda a, d, dis, p: (a + d) / (p + dis), + + "RR": lambda a, d, dis, p: a / p, + + "SM": lambda a, d, dis, p: (a + d) / p, + + "SS1": lambda a, d, dis, p: a / (a + 2 * dis), + + "SS2": lambda a, d, dis, p: (2 * (a + d)) / (p + (a + d)), + + } + + + + if sim_index not in similarity_indices: + + raise ValueError( + + f"Argument sim_index={sim_index} is not recognized! Choose from {similarity_indices.keys()}" + + ) + + if x.ndim != 1 or y.ndim != 1: + + raise ValueError(f"Arguments x and y should be 1D arrays, got {x.ndim} and {y.ndim}") + + if x.shape != y.shape: + + raise ValueError( + + f"Arguments x and y should have the same shape, got {x.shape} != {y.shape}" + + ) + + a, d, dis, p = _compute_base_descriptors(x, y) + + return similarity_indices[sim_index](a, d, dis, p) + + + + + +def _compute_base_descriptors(x, y): + + """Compute the base descriptors for the similarity indices. + + + + Parameters + + ---------- + + x : ndarray of shape (n_features,) + + Feature array of sample `x` in an `n_features` dimensional space + + y : ndarray of shape (n_features,) + + Feature array of sample `y` in an `n_features` dimensional space + + + + Returns + + ------- + + tuple(int, int, int, int) + + The number of common on bits, number of common off bits, number of 1-0 mismatches, and the + + length of the fingerprint. + + """ + + p = len(x) + + a = np.dot(x, y) + + d = np.dot(1 - x, 1 - y) + + dis = p - a - d + + return a, d, dis, p \ No newline at end of file